summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /kernel
downloadlinux-sh-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.gz
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile53
-rw-r--r--kernel/acct.c561
-rw-r--r--kernel/audit.c839
-rw-r--r--kernel/auditsc.c1015
-rw-r--r--kernel/capability.c220
-rw-r--r--kernel/compat.c860
-rw-r--r--kernel/configs.c118
-rw-r--r--kernel/cpu.c193
-rw-r--r--kernel/cpuset.c1564
-rw-r--r--kernel/dma.c158
-rw-r--r--kernel/exec_domain.c209
-rw-r--r--kernel/exit.c1527
-rw-r--r--kernel/extable.c67
-rw-r--r--kernel/fork.c1274
-rw-r--r--kernel/futex.c798
-rw-r--r--kernel/intermodule.c182
-rw-r--r--kernel/irq/Makefile5
-rw-r--r--kernel/irq/autoprobe.c189
-rw-r--r--kernel/irq/handle.c193
-rw-r--r--kernel/irq/internals.h18
-rw-r--r--kernel/irq/manage.c349
-rw-r--r--kernel/irq/proc.c159
-rw-r--r--kernel/irq/spurious.c96
-rw-r--r--kernel/itimer.c241
-rw-r--r--kernel/kallsyms.c411
-rw-r--r--kernel/kfifo.c168
-rw-r--r--kernel/kmod.c256
-rw-r--r--kernel/kprobes.c157
-rw-r--r--kernel/ksysfs.c57
-rw-r--r--kernel/kthread.c202
-rw-r--r--kernel/module.c2108
-rw-r--r--kernel/panic.c157
-rw-r--r--kernel/params.c721
-rw-r--r--kernel/pid.c292
-rw-r--r--kernel/posix-cpu-timers.c1559
-rw-r--r--kernel/posix-timers.c1584
-rw-r--r--kernel/power/Kconfig74
-rw-r--r--kernel/power/Makefile11
-rw-r--r--kernel/power/console.c58
-rw-r--r--kernel/power/disk.c431
-rw-r--r--kernel/power/main.c269
-rw-r--r--kernel/power/pm.c265
-rw-r--r--kernel/power/power.h52
-rw-r--r--kernel/power/poweroff.c45
-rw-r--r--kernel/power/process.c121
-rw-r--r--kernel/power/smp.c85
-rw-r--r--kernel/power/swsusp.c1433
-rw-r--r--kernel/printk.c996
-rw-r--r--kernel/profile.c563
-rw-r--r--kernel/ptrace.c389
-rw-r--r--kernel/rcupdate.c470
-rw-r--r--kernel/resource.c551
-rw-r--r--kernel/sched.c5004
-rw-r--r--kernel/seccomp.c56
-rw-r--r--kernel/signal.c2662
-rw-r--r--kernel/softirq.c496
-rw-r--r--kernel/spinlock.c371
-rw-r--r--kernel/stop_machine.c212
-rw-r--r--kernel/sys.c1725
-rw-r--r--kernel/sys_ni.c86
-rw-r--r--kernel/sysctl.c2337
-rw-r--r--kernel/time.c599
-rw-r--r--kernel/timer.c1611
-rw-r--r--kernel/uid16.c196
-rw-r--r--kernel/user.c189
-rw-r--r--kernel/wait.c246
-rw-r--r--kernel/workqueue.c555
67 files changed, 40718 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 000000000000..eb88b446c2cc
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,53 @@
+#
+# Makefile for the linux kernel.
+#
+
+obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
+ exit.o itimer.o time.o softirq.o resource.o \
+ sysctl.o capability.o ptrace.o timer.o user.o \
+ signal.o sys.o kmod.o workqueue.o pid.o \
+ rcupdate.o intermodule.o extable.o params.o posix-timers.o \
+ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o
+
+obj-$(CONFIG_FUTEX) += futex.o
+obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
+obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_UID16) += uid16.o
+obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_PM) += power/
+obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_COMPAT) += compat.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_IKCONFIG_PROC) += configs.o
+obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+obj-$(CONFIG_SECCOMP) += seccomp.o
+
+ifneq ($(CONFIG_IA64),y)
+# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
+# needed for x86 only. Why this used to be enabled for all architectures is beyond
+# me. I suspect most platforms don't need this, but until we know that for sure
+# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
+# to get a correct value for the wait-channel (WCHAN in ps). --davidm
+CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
+endif
+
+$(obj)/configs.o: $(obj)/config_data.h
+
+# config_data.h contains the same information as ikconfig.h but gzipped.
+# Info from config_data can be extracted from /proc/config*
+targets += config_data.gz
+$(obj)/config_data.gz: .config FORCE
+ $(call if_changed,gzip)
+
+quiet_cmd_ikconfiggz = IKCFG $@
+ cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
+targets += config_data.h
+$(obj)/config_data.h: $(obj)/config_data.gz FORCE
+ $(call if_changed,ikconfiggz)
diff --git a/kernel/acct.c b/kernel/acct.c
new file mode 100644
index 000000000000..4168f631868e
--- /dev/null
+++ b/kernel/acct.c
@@ -0,0 +1,561 @@
+/*
+ * linux/kernel/acct.c
+ *
+ * BSD Process Accounting for Linux
+ *
+ * Author: Marco van Wieringen <mvw@planets.elm.net>
+ *
+ * Some code based on ideas and code from:
+ * Thomas K. Dyas <tdyas@eden.rutgers.edu>
+ *
+ * This file implements BSD-style process accounting. Whenever any
+ * process exits, an accounting record of type "struct acct" is
+ * written to the file specified with the acct() system call. It is
+ * up to user-level programs to do useful things with the accounting
+ * log. The kernel just provides the raw accounting information.
+ *
+ * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
+ *
+ * Plugged two leaks. 1) It didn't return acct_file into the free_filps if
+ * the file happened to be read-only. 2) If the accounting was suspended
+ * due to the lack of space it happily allowed to reopen it and completely
+ * lost the old acct_file. 3/10/98, Al Viro.
+ *
+ * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
+ * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
+ *
+ * Fixed a nasty interaction with with sys_umount(). If the accointing
+ * was suspeneded we failed to stop it on umount(). Messy.
+ * Another one: remount to readonly didn't stop accounting.
+ * Question: what should we do if we have CAP_SYS_ADMIN but not
+ * CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
+ * unless we are messing with the root. In that case we are getting a
+ * real mess with do_remount_sb(). 9/11/98, AV.
+ *
+ * Fixed a bunch of races (and pair of leaks). Probably not the best way,
+ * but this one obviously doesn't introduce deadlocks. Later. BTW, found
+ * one race (and leak) in BSD implementation.
+ * OK, that's better. ANOTHER race and leak in BSD variant. There always
+ * is one more bug... 10/11/98, AV.
+ *
+ * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
+ * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
+ * a struct file opened for write. Fixed. 2/6/2000, AV.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/acct.h>
+#include <linux/file.h>
+#include <linux/tty.h>
+#include <linux/security.h>
+#include <linux/vfs.h>
+#include <linux/jiffies.h>
+#include <linux/times.h>
+#include <linux/syscalls.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+#include <linux/blkdev.h> /* sector_div */
+
+/*
+ * These constants control the amount of freespace that suspend and
+ * resume the process accounting system, and the time delay between
+ * each check.
+ * Turned into sysctl-controllable parameters. AV, 12/11/98
+ */
+
+int acct_parm[3] = {4, 2, 30};
+#define RESUME (acct_parm[0]) /* >foo% free space - resume */
+#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
+#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
+
+/*
+ * External references and all of the globals.
+ */
+static void do_acct_process(long, struct file *);
+
+/*
+ * This structure is used so that all the data protected by lock
+ * can be placed in the same cache line as the lock. This primes
+ * the cache line to have the data after getting the lock.
+ */
+struct acct_glbs {
+ spinlock_t lock;
+ volatile int active;
+ volatile int needcheck;
+ struct file *file;
+ struct timer_list timer;
+};
+
+static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED};
+
+/*
+ * Called whenever the timer says to check the free space.
+ */
+static void acct_timeout(unsigned long unused)
+{
+ acct_globals.needcheck = 1;
+}
+
+/*
+ * Check the amount of free space and suspend/resume accordingly.
+ */
+static int check_free_space(struct file *file)
+{
+ struct kstatfs sbuf;
+ int res;
+ int act;
+ sector_t resume;
+ sector_t suspend;
+
+ spin_lock(&acct_globals.lock);
+ res = acct_globals.active;
+ if (!file || !acct_globals.needcheck)
+ goto out;
+ spin_unlock(&acct_globals.lock);
+
+ /* May block */
+ if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
+ return res;
+ suspend = sbuf.f_blocks * SUSPEND;
+ resume = sbuf.f_blocks * RESUME;
+
+ sector_div(suspend, 100);
+ sector_div(resume, 100);
+
+ if (sbuf.f_bavail <= suspend)
+ act = -1;
+ else if (sbuf.f_bavail >= resume)
+ act = 1;
+ else
+ act = 0;
+
+ /*
+ * If some joker switched acct_globals.file under us we'ld better be
+ * silent and _not_ touch anything.
+ */
+ spin_lock(&acct_globals.lock);
+ if (file != acct_globals.file) {
+ if (act)
+ res = act>0;
+ goto out;
+ }
+
+ if (acct_globals.active) {
+ if (act < 0) {
+ acct_globals.active = 0;
+ printk(KERN_INFO "Process accounting paused\n");
+ }
+ } else {
+ if (act > 0) {
+ acct_globals.active = 1;
+ printk(KERN_INFO "Process accounting resumed\n");
+ }
+ }
+
+ del_timer(&acct_globals.timer);
+ acct_globals.needcheck = 0;
+ acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+ add_timer(&acct_globals.timer);
+ res = acct_globals.active;
+out:
+ spin_unlock(&acct_globals.lock);
+ return res;
+}
+
+/*
+ * Close the old accouting file (if currently open) and then replace
+ * it with file (if non-NULL).
+ *
+ * NOTE: acct_globals.lock MUST be held on entry and exit.
+ */
+static void acct_file_reopen(struct file *file)
+{
+ struct file *old_acct = NULL;
+
+ if (acct_globals.file) {
+ old_acct = acct_globals.file;
+ del_timer(&acct_globals.timer);
+ acct_globals.active = 0;
+ acct_globals.needcheck = 0;
+ acct_globals.file = NULL;
+ }
+ if (file) {
+ acct_globals.file = file;
+ acct_globals.needcheck = 0;
+ acct_globals.active = 1;
+ /* It's been deleted if it was used before so this is safe */
+ init_timer(&acct_globals.timer);
+ acct_globals.timer.function = acct_timeout;
+ acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+ add_timer(&acct_globals.timer);
+ }
+ if (old_acct) {
+ spin_unlock(&acct_globals.lock);
+ do_acct_process(0, old_acct);
+ filp_close(old_acct, NULL);
+ spin_lock(&acct_globals.lock);
+ }
+}
+
+/*
+ * sys_acct() is the only system call needed to implement process
+ * accounting. It takes the name of the file where accounting records
+ * should be written. If the filename is NULL, accounting will be
+ * shutdown.
+ */
+asmlinkage long sys_acct(const char __user *name)
+{
+ struct file *file = NULL;
+ char *tmp;
+ int error;
+
+ if (!capable(CAP_SYS_PACCT))
+ return -EPERM;
+
+ if (name) {
+ tmp = getname(name);
+ if (IS_ERR(tmp)) {
+ return (PTR_ERR(tmp));
+ }
+ /* Difference from BSD - they don't do O_APPEND */
+ file = filp_open(tmp, O_WRONLY|O_APPEND, 0);
+ putname(tmp);
+ if (IS_ERR(file)) {
+ return (PTR_ERR(file));
+ }
+ if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
+ filp_close(file, NULL);
+ return (-EACCES);
+ }
+
+ if (!file->f_op->write) {
+ filp_close(file, NULL);
+ return (-EIO);
+ }
+ }
+
+ error = security_acct(file);
+ if (error) {
+ if (file)
+ filp_close(file, NULL);
+ return error;
+ }
+
+ spin_lock(&acct_globals.lock);
+ acct_file_reopen(file);
+ spin_unlock(&acct_globals.lock);
+
+ return (0);
+}
+
+/*
+ * If the accouting is turned on for a file in the filesystem pointed
+ * to by sb, turn accouting off.
+ */
+void acct_auto_close(struct super_block *sb)
+{
+ spin_lock(&acct_globals.lock);
+ if (acct_globals.file &&
+ acct_globals.file->f_dentry->d_inode->i_sb == sb) {
+ acct_file_reopen((struct file *)NULL);
+ }
+ spin_unlock(&acct_globals.lock);
+}
+
+/*
+ * encode an unsigned long into a comp_t
+ *
+ * This routine has been adopted from the encode_comp_t() function in
+ * the kern_acct.c file of the FreeBSD operating system. The encoding
+ * is a 13-bit fraction with a 3-bit (base 8) exponent.
+ */
+
+#define MANTSIZE 13 /* 13 bit mantissa. */
+#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
+#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
+
+static comp_t encode_comp_t(unsigned long value)
+{
+ int exp, rnd;
+
+ exp = rnd = 0;
+ while (value > MAXFRACT) {
+ rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
+ value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
+ exp++;
+ }
+
+ /*
+ * If we need to round up, do it (and handle overflow correctly).
+ */
+ if (rnd && (++value > MAXFRACT)) {
+ value >>= EXPSIZE;
+ exp++;
+ }
+
+ /*
+ * Clean it up and polish it off.
+ */
+ exp <<= MANTSIZE; /* Shift the exponent into place */
+ exp += value; /* and add on the mantissa. */
+ return exp;
+}
+
+#if ACCT_VERSION==1 || ACCT_VERSION==2
+/*
+ * encode an u64 into a comp2_t (24 bits)
+ *
+ * Format: 5 bit base 2 exponent, 20 bits mantissa.
+ * The leading bit of the mantissa is not stored, but implied for
+ * non-zero exponents.
+ * Largest encodable value is 50 bits.
+ */
+
+#define MANTSIZE2 20 /* 20 bit mantissa. */
+#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
+#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
+#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */
+
+static comp2_t encode_comp2_t(u64 value)
+{
+ int exp, rnd;
+
+ exp = (value > (MAXFRACT2>>1));
+ rnd = 0;
+ while (value > MAXFRACT2) {
+ rnd = value & 1;
+ value >>= 1;
+ exp++;
+ }
+
+ /*
+ * If we need to round up, do it (and handle overflow correctly).
+ */
+ if (rnd && (++value > MAXFRACT2)) {
+ value >>= 1;
+ exp++;
+ }
+
+ if (exp > MAXEXP2) {
+ /* Overflow. Return largest representable number instead. */
+ return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
+ } else {
+ return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
+ }
+}
+#endif
+
+#if ACCT_VERSION==3
+/*
+ * encode an u64 into a 32 bit IEEE float
+ */
+static u32 encode_float(u64 value)
+{
+ unsigned exp = 190;
+ unsigned u;
+
+ if (value==0) return 0;
+ while ((s64)value > 0){
+ value <<= 1;
+ exp--;
+ }
+ u = (u32)(value >> 40) & 0x7fffffu;
+ return u | (exp << 23);
+}
+#endif
+
+/*
+ * Write an accounting entry for an exiting process
+ *
+ * The acct_process() call is the workhorse of the process
+ * accounting system. The struct acct is built here and then written
+ * into the accounting file. This function should only be called from
+ * do_exit().
+ */
+
+/*
+ * do_acct_process does all actual work. Caller holds the reference to file.
+ */
+static void do_acct_process(long exitcode, struct file *file)
+{
+ acct_t ac;
+ mm_segment_t fs;
+ unsigned long vsize;
+ unsigned long flim;
+ u64 elapsed;
+ u64 run_time;
+ struct timespec uptime;
+
+ /*
+ * First check to see if there is enough free_space to continue
+ * the process accounting system.
+ */
+ if (!check_free_space(file))
+ return;
+
+ /*
+ * Fill the accounting struct with the needed info as recorded
+ * by the different kernel functions.
+ */
+ memset((caddr_t)&ac, 0, sizeof(acct_t));
+
+ ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
+ strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
+
+ /* calculate run_time in nsec*/
+ do_posix_clock_monotonic_gettime(&uptime);
+ run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
+ run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC
+ + current->start_time.tv_nsec;
+ /* convert nsec -> AHZ */
+ elapsed = nsec_to_AHZ(run_time);
+#if ACCT_VERSION==3
+ ac.ac_etime = encode_float(elapsed);
+#else
+ ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
+ (unsigned long) elapsed : (unsigned long) -1l);
+#endif
+#if ACCT_VERSION==1 || ACCT_VERSION==2
+ {
+ /* new enlarged etime field */
+ comp2_t etime = encode_comp2_t(elapsed);
+ ac.ac_etime_hi = etime >> 16;
+ ac.ac_etime_lo = (u16) etime;
+ }
+#endif
+ do_div(elapsed, AHZ);
+ ac.ac_btime = xtime.tv_sec - elapsed;
+ ac.ac_utime = encode_comp_t(jiffies_to_AHZ(
+ current->signal->utime +
+ current->group_leader->utime));
+ ac.ac_stime = encode_comp_t(jiffies_to_AHZ(
+ current->signal->stime +
+ current->group_leader->stime));
+ /* we really need to bite the bullet and change layout */
+ ac.ac_uid = current->uid;
+ ac.ac_gid = current->gid;
+#if ACCT_VERSION==2
+ ac.ac_ahz = AHZ;
+#endif
+#if ACCT_VERSION==1 || ACCT_VERSION==2
+ /* backward-compatible 16 bit fields */
+ ac.ac_uid16 = current->uid;
+ ac.ac_gid16 = current->gid;
+#endif
+#if ACCT_VERSION==3
+ ac.ac_pid = current->tgid;
+ ac.ac_ppid = current->parent->tgid;
+#endif
+
+ read_lock(&tasklist_lock); /* pin current->signal */
+ ac.ac_tty = current->signal->tty ?
+ old_encode_dev(tty_devnum(current->signal->tty)) : 0;
+ read_unlock(&tasklist_lock);
+
+ ac.ac_flag = 0;
+ if (current->flags & PF_FORKNOEXEC)
+ ac.ac_flag |= AFORK;
+ if (current->flags & PF_SUPERPRIV)
+ ac.ac_flag |= ASU;
+ if (current->flags & PF_DUMPCORE)
+ ac.ac_flag |= ACORE;
+ if (current->flags & PF_SIGNALED)
+ ac.ac_flag |= AXSIG;
+
+ vsize = 0;
+ if (current->mm) {
+ struct vm_area_struct *vma;
+ down_read(&current->mm->mmap_sem);
+ vma = current->mm->mmap;
+ while (vma) {
+ vsize += vma->vm_end - vma->vm_start;
+ vma = vma->vm_next;
+ }
+ up_read(&current->mm->mmap_sem);
+ }
+ vsize = vsize / 1024;
+ ac.ac_mem = encode_comp_t(vsize);
+ ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
+ ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
+ ac.ac_minflt = encode_comp_t(current->signal->min_flt +
+ current->group_leader->min_flt);
+ ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
+ current->group_leader->maj_flt);
+ ac.ac_swaps = encode_comp_t(0);
+ ac.ac_exitcode = exitcode;
+
+ /*
+ * Kernel segment override to datasegment and write it
+ * to the accounting file.
+ */
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ /*
+ * Accounting records are not subject to resource limits.
+ */
+ flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ file->f_op->write(file, (char *)&ac,
+ sizeof(acct_t), &file->f_pos);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
+ set_fs(fs);
+}
+
+/*
+ * acct_process - now just a wrapper around do_acct_process
+ */
+void acct_process(long exitcode)
+{
+ struct file *file = NULL;
+
+ /*
+ * accelerate the common fastpath:
+ */
+ if (!acct_globals.file)
+ return;
+
+ spin_lock(&acct_globals.lock);
+ file = acct_globals.file;
+ if (unlikely(!file)) {
+ spin_unlock(&acct_globals.lock);
+ return;
+ }
+ get_file(file);
+ spin_unlock(&acct_globals.lock);
+
+ do_acct_process(exitcode, file);
+ fput(file);
+}
+
+
+/*
+ * acct_update_integrals
+ * - update mm integral fields in task_struct
+ */
+void acct_update_integrals(struct task_struct *tsk)
+{
+ if (likely(tsk->mm)) {
+ long delta = tsk->stime - tsk->acct_stimexpd;
+
+ if (delta == 0)
+ return;
+ tsk->acct_stimexpd = tsk->stime;
+ tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss);
+ tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+ }
+}
+
+/*
+ * acct_clear_integrals
+ * - clear the mm integral fields in task_struct
+ */
+void acct_clear_integrals(struct task_struct *tsk)
+{
+ if (tsk) {
+ tsk->acct_stimexpd = 0;
+ tsk->acct_rss_mem1 = 0;
+ tsk->acct_vm_mem1 = 0;
+ }
+}
diff --git a/kernel/audit.c b/kernel/audit.c
new file mode 100644
index 000000000000..0f84dd7af2c8
--- /dev/null
+++ b/kernel/audit.c
@@ -0,0 +1,839 @@
+/* audit.c -- Auditing support -*- linux-c -*-
+ * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
+ * System-call specific features have moved to auditsc.c
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Goals: 1) Integrate fully with SELinux.
+ * 2) Minimal run-time overhead:
+ * a) Minimal when syscall auditing is disabled (audit_enable=0).
+ * b) Small when syscall auditing is enabled and no audit record
+ * is generated (defer as much work as possible to record
+ * generation time):
+ * i) context is allocated,
+ * ii) names from getname are stored without a copy, and
+ * iii) inode information stored from path_lookup.
+ * 3) Ability to disable syscall auditing at boot time (audit=0).
+ * 4) Usable by other parts of the kernel (if audit_log* is called,
+ * then a syscall record will be generated automatically for the
+ * current syscall).
+ * 5) Netlink interface to user-space.
+ * 6) Support low-overhead kernel-based filtering to minimize the
+ * information that must be passed to user-space.
+ *
+ * Example user-space utilities: http://people.redhat.com/faith/audit/
+ */
+
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <linux/audit.h>
+
+#include <net/sock.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+
+/* No auditing will take place until audit_initialized != 0.
+ * (Initialization happens after skb_init is called.) */
+static int audit_initialized;
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+int audit_enabled;
+
+/* Default state when kernel boots without any parameters. */
+static int audit_default;
+
+/* If auditing cannot proceed, audit_failure selects what happens. */
+static int audit_failure = AUDIT_FAIL_PRINTK;
+
+/* If audit records are to be written to the netlink socket, audit_pid
+ * contains the (non-zero) pid. */
+static int audit_pid;
+
+/* If audit_limit is non-zero, limit the rate of sending audit records
+ * to that number per second. This prevents DoS attacks, but results in
+ * audit records being dropped. */
+static int audit_rate_limit;
+
+/* Number of outstanding audit_buffers allowed. */
+static int audit_backlog_limit = 64;
+static atomic_t audit_backlog = ATOMIC_INIT(0);
+
+/* Records can be lost in several ways:
+ 0) [suppressed in audit_alloc]
+ 1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
+ 2) out of memory in audit_log_move [alloc_skb]
+ 3) suppressed due to audit_rate_limit
+ 4) suppressed due to audit_backlog_limit
+*/
+static atomic_t audit_lost = ATOMIC_INIT(0);
+
+/* The netlink socket. */
+static struct sock *audit_sock;
+
+/* There are two lists of audit buffers. The txlist contains audit
+ * buffers that cannot be sent immediately to the netlink device because
+ * we are in an irq context (these are sent later in a tasklet).
+ *
+ * The second list is a list of pre-allocated audit buffers (if more
+ * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
+ * being placed on the freelist). */
+static DEFINE_SPINLOCK(audit_txlist_lock);
+static DEFINE_SPINLOCK(audit_freelist_lock);
+static int audit_freelist_count = 0;
+static LIST_HEAD(audit_txlist);
+static LIST_HEAD(audit_freelist);
+
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+static LIST_HEAD(audit_tsklist);
+static LIST_HEAD(audit_entlist);
+static LIST_HEAD(audit_extlist);
+
+/* The netlink socket is only to be read by 1 CPU, which lets us assume
+ * that list additions and deletions never happen simultaneiously in
+ * auditsc.c */
+static DECLARE_MUTEX(audit_netlink_sem);
+
+/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
+ * audit records. Since printk uses a 1024 byte buffer, this buffer
+ * should be at least that large. */
+#define AUDIT_BUFSIZ 1024
+
+/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
+ * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
+#define AUDIT_MAXFREE (2*NR_CPUS)
+
+/* The audit_buffer is used when formatting an audit record. The caller
+ * locks briefly to get the record off the freelist or to allocate the
+ * buffer, and locks briefly to send the buffer to the netlink layer or
+ * to place it on a transmit queue. Multiple audit_buffers can be in
+ * use simultaneously. */
+struct audit_buffer {
+ struct list_head list;
+ struct sk_buff_head sklist; /* formatted skbs ready to send */
+ struct audit_context *ctx; /* NULL or associated context */
+ int len; /* used area of tmp */
+ char tmp[AUDIT_BUFSIZ];
+
+ /* Pointer to header and contents */
+ struct nlmsghdr *nlh;
+ int total;
+ int type;
+ int pid;
+ int count; /* Times requeued */
+};
+
+void audit_set_type(struct audit_buffer *ab, int type)
+{
+ ab->type = type;
+}
+
+struct audit_entry {
+ struct list_head list;
+ struct audit_rule rule;
+};
+
+static void audit_log_end_irq(struct audit_buffer *ab);
+static void audit_log_end_fast(struct audit_buffer *ab);
+
+static void audit_panic(const char *message)
+{
+ switch (audit_failure)
+ {
+ case AUDIT_FAIL_SILENT:
+ break;
+ case AUDIT_FAIL_PRINTK:
+ printk(KERN_ERR "audit: %s\n", message);
+ break;
+ case AUDIT_FAIL_PANIC:
+ panic("audit: %s\n", message);
+ break;
+ }
+}
+
+static inline int audit_rate_check(void)
+{
+ static unsigned long last_check = 0;
+ static int messages = 0;
+ static DEFINE_SPINLOCK(lock);
+ unsigned long flags;
+ unsigned long now;
+ unsigned long elapsed;
+ int retval = 0;
+
+ if (!audit_rate_limit) return 1;
+
+ spin_lock_irqsave(&lock, flags);
+ if (++messages < audit_rate_limit) {
+ retval = 1;
+ } else {
+ now = jiffies;
+ elapsed = now - last_check;
+ if (elapsed > HZ) {
+ last_check = now;
+ messages = 0;
+ retval = 1;
+ }
+ }
+ spin_unlock_irqrestore(&lock, flags);
+
+ return retval;
+}
+
+/* Emit at least 1 message per second, even if audit_rate_check is
+ * throttling. */
+void audit_log_lost(const char *message)
+{
+ static unsigned long last_msg = 0;
+ static DEFINE_SPINLOCK(lock);
+ unsigned long flags;
+ unsigned long now;
+ int print;
+
+ atomic_inc(&audit_lost);
+
+ print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
+
+ if (!print) {
+ spin_lock_irqsave(&lock, flags);
+ now = jiffies;
+ if (now - last_msg > HZ) {
+ print = 1;
+ last_msg = now;
+ }
+ spin_unlock_irqrestore(&lock, flags);
+ }
+
+ if (print) {
+ printk(KERN_WARNING
+ "audit: audit_lost=%d audit_backlog=%d"
+ " audit_rate_limit=%d audit_backlog_limit=%d\n",
+ atomic_read(&audit_lost),
+ atomic_read(&audit_backlog),
+ audit_rate_limit,
+ audit_backlog_limit);
+ audit_panic(message);
+ }
+
+}
+
+static int audit_set_rate_limit(int limit)
+{
+ int old = audit_rate_limit;
+ audit_rate_limit = limit;
+ audit_log(current->audit_context, "audit_rate_limit=%d old=%d",
+ audit_rate_limit, old);
+ return old;
+}
+
+static int audit_set_backlog_limit(int limit)
+{
+ int old = audit_backlog_limit;
+ audit_backlog_limit = limit;
+ audit_log(current->audit_context, "audit_backlog_limit=%d old=%d",
+ audit_backlog_limit, old);
+ return old;
+}
+
+static int audit_set_enabled(int state)
+{
+ int old = audit_enabled;
+ if (state != 0 && state != 1)
+ return -EINVAL;
+ audit_enabled = state;
+ audit_log(current->audit_context, "audit_enabled=%d old=%d",
+ audit_enabled, old);
+ return old;
+}
+
+static int audit_set_failure(int state)
+{
+ int old = audit_failure;
+ if (state != AUDIT_FAIL_SILENT
+ && state != AUDIT_FAIL_PRINTK
+ && state != AUDIT_FAIL_PANIC)
+ return -EINVAL;
+ audit_failure = state;
+ audit_log(current->audit_context, "audit_failure=%d old=%d",
+ audit_failure, old);
+ return old;
+}
+
+#ifdef CONFIG_NET
+void audit_send_reply(int pid, int seq, int type, int done, int multi,
+ void *payload, int size)
+{
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ int len = NLMSG_SPACE(size);
+ void *data;
+ int flags = multi ? NLM_F_MULTI : 0;
+ int t = done ? NLMSG_DONE : type;
+
+ skb = alloc_skb(len, GFP_KERNEL);
+ if (!skb)
+ goto nlmsg_failure;
+
+ nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
+ nlh->nlmsg_flags = flags;
+ data = NLMSG_DATA(nlh);
+ memcpy(data, payload, size);
+ netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
+ return;
+
+nlmsg_failure: /* Used by NLMSG_PUT */
+ if (skb)
+ kfree_skb(skb);
+}
+
+/*
+ * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
+ * control messages.
+ */
+static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
+{
+ int err = 0;
+
+ switch (msg_type) {
+ case AUDIT_GET:
+ case AUDIT_LIST:
+ case AUDIT_SET:
+ case AUDIT_ADD:
+ case AUDIT_DEL:
+ if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
+ err = -EPERM;
+ break;
+ case AUDIT_USER:
+ if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
+ err = -EPERM;
+ break;
+ default: /* bad msg */
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ u32 uid, pid, seq;
+ void *data;
+ struct audit_status *status_get, status_set;
+ int err;
+ struct audit_buffer *ab;
+ u16 msg_type = nlh->nlmsg_type;
+
+ err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
+ if (err)
+ return err;
+
+ pid = NETLINK_CREDS(skb)->pid;
+ uid = NETLINK_CREDS(skb)->uid;
+ seq = nlh->nlmsg_seq;
+ data = NLMSG_DATA(nlh);
+
+ switch (msg_type) {
+ case AUDIT_GET:
+ status_set.enabled = audit_enabled;
+ status_set.failure = audit_failure;
+ status_set.pid = audit_pid;
+ status_set.rate_limit = audit_rate_limit;
+ status_set.backlog_limit = audit_backlog_limit;
+ status_set.lost = atomic_read(&audit_lost);
+ status_set.backlog = atomic_read(&audit_backlog);
+ audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
+ &status_set, sizeof(status_set));
+ break;
+ case AUDIT_SET:
+ if (nlh->nlmsg_len < sizeof(struct audit_status))
+ return -EINVAL;
+ status_get = (struct audit_status *)data;
+ if (status_get->mask & AUDIT_STATUS_ENABLED) {
+ err = audit_set_enabled(status_get->enabled);
+ if (err < 0) return err;
+ }
+ if (status_get->mask & AUDIT_STATUS_FAILURE) {
+ err = audit_set_failure(status_get->failure);
+ if (err < 0) return err;
+ }
+ if (status_get->mask & AUDIT_STATUS_PID) {
+ int old = audit_pid;
+ audit_pid = status_get->pid;
+ audit_log(current->audit_context,
+ "audit_pid=%d old=%d", audit_pid, old);
+ }
+ if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
+ audit_set_rate_limit(status_get->rate_limit);
+ if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
+ audit_set_backlog_limit(status_get->backlog_limit);
+ break;
+ case AUDIT_USER:
+ ab = audit_log_start(NULL);
+ if (!ab)
+ break; /* audit_panic has been called */
+ audit_log_format(ab,
+ "user pid=%d uid=%d length=%d msg='%.1024s'",
+ pid, uid,
+ (int)(nlh->nlmsg_len
+ - ((char *)data - (char *)nlh)),
+ (char *)data);
+ ab->type = AUDIT_USER;
+ ab->pid = pid;
+ audit_log_end(ab);
+ break;
+ case AUDIT_ADD:
+ case AUDIT_DEL:
+ if (nlh->nlmsg_len < sizeof(struct audit_rule))
+ return -EINVAL;
+ /* fallthrough */
+ case AUDIT_LIST:
+#ifdef CONFIG_AUDITSYSCALL
+ err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+ uid, seq, data);
+#else
+ err = -EOPNOTSUPP;
+#endif
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ return err < 0 ? err : 0;
+}
+
+/* Get message from skb (based on rtnetlink_rcv_skb). Each message is
+ * processed by audit_receive_msg. Malformed skbs with wrong length are
+ * discarded silently. */
+static int audit_receive_skb(struct sk_buff *skb)
+{
+ int err;
+ struct nlmsghdr *nlh;
+ u32 rlen;
+
+ while (skb->len >= NLMSG_SPACE(0)) {
+ nlh = (struct nlmsghdr *)skb->data;
+ if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+ return 0;
+ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (rlen > skb->len)
+ rlen = skb->len;
+ if ((err = audit_receive_msg(skb, nlh))) {
+ netlink_ack(skb, nlh, err);
+ } else if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(skb, nlh, 0);
+ skb_pull(skb, rlen);
+ }
+ return 0;
+}
+
+/* Receive messages from netlink socket. */
+static void audit_receive(struct sock *sk, int length)
+{
+ struct sk_buff *skb;
+
+ if (down_trylock(&audit_netlink_sem))
+ return;
+
+ /* FIXME: this must not cause starvation */
+ while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+ if (audit_receive_skb(skb) && skb->len)
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ else
+ kfree_skb(skb);
+ }
+ up(&audit_netlink_sem);
+}
+
+/* Move data from tmp buffer into an skb. This is an extra copy, and
+ * that is unfortunate. However, the copy will only occur when a record
+ * is being written to user space, which is already a high-overhead
+ * operation. (Elimination of the copy is possible, for example, by
+ * writing directly into a pre-allocated skb, at the cost of wasting
+ * memory. */
+static void audit_log_move(struct audit_buffer *ab)
+{
+ struct sk_buff *skb;
+ char *start;
+ int extra = ab->nlh ? 0 : NLMSG_SPACE(0);
+
+ /* possible resubmission */
+ if (ab->len == 0)
+ return;
+
+ skb = skb_peek(&ab->sklist);
+ if (!skb || skb_tailroom(skb) <= ab->len + extra) {
+ skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
+ if (!skb) {
+ ab->len = 0; /* Lose information in ab->tmp */
+ audit_log_lost("out of memory in audit_log_move");
+ return;
+ }
+ __skb_queue_tail(&ab->sklist, skb);
+ if (!ab->nlh)
+ ab->nlh = (struct nlmsghdr *)skb_put(skb,
+ NLMSG_SPACE(0));
+ }
+ start = skb_put(skb, ab->len);
+ memcpy(start, ab->tmp, ab->len);
+ ab->len = 0;
+}
+
+/* Iterate over the skbuff in the audit_buffer, sending their contents
+ * to user space. */
+static inline int audit_log_drain(struct audit_buffer *ab)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&ab->sklist))) {
+ int retval = 0;
+
+ if (audit_pid) {
+ if (ab->nlh) {
+ ab->nlh->nlmsg_len = ab->total;
+ ab->nlh->nlmsg_type = ab->type;
+ ab->nlh->nlmsg_flags = 0;
+ ab->nlh->nlmsg_seq = 0;
+ ab->nlh->nlmsg_pid = ab->pid;
+ }
+ skb_get(skb); /* because netlink_* frees */
+ retval = netlink_unicast(audit_sock, skb, audit_pid,
+ MSG_DONTWAIT);
+ }
+ if (retval == -EAGAIN && ab->count < 5) {
+ ++ab->count;
+ skb_queue_tail(&ab->sklist, skb);
+ audit_log_end_irq(ab);
+ return 1;
+ }
+ if (retval < 0) {
+ if (retval == -ECONNREFUSED) {
+ printk(KERN_ERR
+ "audit: *NO* daemon at audit_pid=%d\n",
+ audit_pid);
+ audit_pid = 0;
+ } else
+ audit_log_lost("netlink socket too busy");
+ }
+ if (!audit_pid) { /* No daemon */
+ int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
+ int len = skb->len - offset;
+ printk(KERN_ERR "%*.*s\n",
+ len, len, skb->data + offset);
+ }
+ kfree_skb(skb);
+ ab->nlh = NULL;
+ }
+ return 0;
+}
+
+/* Initialize audit support at boot time. */
+static int __init audit_init(void)
+{
+ printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
+ audit_default ? "enabled" : "disabled");
+ audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive);
+ if (!audit_sock)
+ audit_panic("cannot initialize netlink socket");
+
+ audit_initialized = 1;
+ audit_enabled = audit_default;
+ audit_log(NULL, "initialized");
+ return 0;
+}
+
+#else
+/* Without CONFIG_NET, we have no skbuffs. For now, print what we have
+ * in the buffer. */
+static void audit_log_move(struct audit_buffer *ab)
+{
+ printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp);
+ ab->len = 0;
+}
+
+static inline int audit_log_drain(struct audit_buffer *ab)
+{
+ return 0;
+}
+
+/* Initialize audit support at boot time. */
+int __init audit_init(void)
+{
+ printk(KERN_INFO "audit: initializing WITHOUT netlink support\n");
+ audit_sock = NULL;
+ audit_pid = 0;
+
+ audit_initialized = 1;
+ audit_enabled = audit_default;
+ audit_log(NULL, "initialized");
+ return 0;
+}
+#endif
+
+__initcall(audit_init);
+
+/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
+static int __init audit_enable(char *str)
+{
+ audit_default = !!simple_strtol(str, NULL, 0);
+ printk(KERN_INFO "audit: %s%s\n",
+ audit_default ? "enabled" : "disabled",
+ audit_initialized ? "" : " (after initialization)");
+ if (audit_initialized)
+ audit_enabled = audit_default;
+ return 0;
+}
+
+__setup("audit=", audit_enable);
+
+
+/* Obtain an audit buffer. This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format. If the tsk is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit. If there is no associated task, tsk
+ * should be NULL. */
+struct audit_buffer *audit_log_start(struct audit_context *ctx)
+{
+ struct audit_buffer *ab = NULL;
+ unsigned long flags;
+ struct timespec t;
+ int serial = 0;
+
+ if (!audit_initialized)
+ return NULL;
+
+ if (audit_backlog_limit
+ && atomic_read(&audit_backlog) > audit_backlog_limit) {
+ if (audit_rate_check())
+ printk(KERN_WARNING
+ "audit: audit_backlog=%d > "
+ "audit_backlog_limit=%d\n",
+ atomic_read(&audit_backlog),
+ audit_backlog_limit);
+ audit_log_lost("backlog limit exceeded");
+ return NULL;
+ }
+
+ spin_lock_irqsave(&audit_freelist_lock, flags);
+ if (!list_empty(&audit_freelist)) {
+ ab = list_entry(audit_freelist.next,
+ struct audit_buffer, list);
+ list_del(&ab->list);
+ --audit_freelist_count;
+ }
+ spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+ if (!ab)
+ ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
+ if (!ab) {
+ audit_log_lost("out of memory in audit_log_start");
+ return NULL;
+ }
+
+ atomic_inc(&audit_backlog);
+ skb_queue_head_init(&ab->sklist);
+
+ ab->ctx = ctx;
+ ab->len = 0;
+ ab->nlh = NULL;
+ ab->total = 0;
+ ab->type = AUDIT_KERNEL;
+ ab->pid = 0;
+ ab->count = 0;
+
+#ifdef CONFIG_AUDITSYSCALL
+ if (ab->ctx)
+ audit_get_stamp(ab->ctx, &t, &serial);
+ else
+#endif
+ t = CURRENT_TIME;
+
+ audit_log_format(ab, "audit(%lu.%03lu:%u): ",
+ t.tv_sec, t.tv_nsec/1000000, serial);
+ return ab;
+}
+
+
+/* Format an audit message into the audit buffer. If there isn't enough
+ * room in the audit buffer, more room will be allocated and vsnprint
+ * will be called a second time. Currently, we assume that a printk
+ * can't format message larger than 1024 bytes, so we don't either. */
+static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
+ va_list args)
+{
+ int len, avail;
+
+ if (!ab)
+ return;
+
+ avail = sizeof(ab->tmp) - ab->len;
+ if (avail <= 0) {
+ audit_log_move(ab);
+ avail = sizeof(ab->tmp) - ab->len;
+ }
+ len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+ if (len >= avail) {
+ /* The printk buffer is 1024 bytes long, so if we get
+ * here and AUDIT_BUFSIZ is at least 1024, then we can
+ * log everything that printk could have logged. */
+ audit_log_move(ab);
+ avail = sizeof(ab->tmp) - ab->len;
+ len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+ }
+ ab->len += (len < avail) ? len : avail;
+ ab->total += (len < avail) ? len : avail;
+}
+
+/* Format a message into the audit buffer. All the work is done in
+ * audit_log_vformat. */
+void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
+{
+ va_list args;
+
+ if (!ab)
+ return;
+ va_start(args, fmt);
+ audit_log_vformat(ab, fmt, args);
+ va_end(args);
+}
+
+/* This is a helper-function to print the d_path without using a static
+ * buffer or allocating another buffer in addition to the one in
+ * audit_buffer. */
+void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
+ struct dentry *dentry, struct vfsmount *vfsmnt)
+{
+ char *p;
+ int len, avail;
+
+ if (prefix) audit_log_format(ab, " %s", prefix);
+
+ if (ab->len > 128)
+ audit_log_move(ab);
+ avail = sizeof(ab->tmp) - ab->len;
+ p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
+ if (IS_ERR(p)) {
+ /* FIXME: can we save some information here? */
+ audit_log_format(ab, "<toolong>");
+ } else {
+ /* path isn't at start of buffer */
+ len = (ab->tmp + sizeof(ab->tmp) - 1) - p;
+ memmove(ab->tmp + ab->len, p, len);
+ ab->len += len;
+ ab->total += len;
+ }
+}
+
+/* Remove queued messages from the audit_txlist and send them to userspace. */
+static void audit_tasklet_handler(unsigned long arg)
+{
+ LIST_HEAD(list);
+ struct audit_buffer *ab;
+ unsigned long flags;
+
+ spin_lock_irqsave(&audit_txlist_lock, flags);
+ list_splice_init(&audit_txlist, &list);
+ spin_unlock_irqrestore(&audit_txlist_lock, flags);
+
+ while (!list_empty(&list)) {
+ ab = list_entry(list.next, struct audit_buffer, list);
+ list_del(&ab->list);
+ audit_log_end_fast(ab);
+ }
+}
+
+static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
+
+/* The netlink_* functions cannot be called inside an irq context, so
+ * the audit buffer is places on a queue and a tasklet is scheduled to
+ * remove them from the queue outside the irq context. May be called in
+ * any context. */
+static void audit_log_end_irq(struct audit_buffer *ab)
+{
+ unsigned long flags;
+
+ if (!ab)
+ return;
+ spin_lock_irqsave(&audit_txlist_lock, flags);
+ list_add_tail(&ab->list, &audit_txlist);
+ spin_unlock_irqrestore(&audit_txlist_lock, flags);
+
+ tasklet_schedule(&audit_tasklet);
+}
+
+/* Send the message in the audit buffer directly to user space. May not
+ * be called in an irq context. */
+static void audit_log_end_fast(struct audit_buffer *ab)
+{
+ unsigned long flags;
+
+ BUG_ON(in_irq());
+ if (!ab)
+ return;
+ if (!audit_rate_check()) {
+ audit_log_lost("rate limit exceeded");
+ } else {
+ audit_log_move(ab);
+ if (audit_log_drain(ab))
+ return;
+ }
+
+ atomic_dec(&audit_backlog);
+ spin_lock_irqsave(&audit_freelist_lock, flags);
+ if (++audit_freelist_count > AUDIT_MAXFREE)
+ kfree(ab);
+ else
+ list_add(&ab->list, &audit_freelist);
+ spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+/* Send or queue the message in the audit buffer, depending on the
+ * current context. (A convenience function that may be called in any
+ * context.) */
+void audit_log_end(struct audit_buffer *ab)
+{
+ if (in_irq())
+ audit_log_end_irq(ab);
+ else
+ audit_log_end_fast(ab);
+}
+
+/* Log an audit record. This is a convenience function that calls
+ * audit_log_start, audit_log_vformat, and audit_log_end. It may be
+ * called in any context. */
+void audit_log(struct audit_context *ctx, const char *fmt, ...)
+{
+ struct audit_buffer *ab;
+ va_list args;
+
+ ab = audit_log_start(ctx);
+ if (ab) {
+ va_start(args, fmt);
+ audit_log_vformat(ab, fmt, args);
+ va_end(args);
+ audit_log_end(ab);
+ }
+}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
new file mode 100644
index 000000000000..8c454852d6a5
--- /dev/null
+++ b/kernel/auditsc.c
@@ -0,0 +1,1015 @@
+/* auditsc.c -- System-call auditing support -*- linux-c -*-
+ * Handles all system-call specific auditing features.
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Many of the ideas implemented here are from Stephen C. Tweedie,
+ * especially the idea of avoiding a copy by using getname.
+ *
+ * The method for actual interception of syscall entry and exit (not in
+ * this file -- see entry.S) is based on a GPL'd patch written by
+ * okir@suse.de and Copyright 2003 SuSE Linux AG.
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <linux/audit.h>
+#include <linux/personality.h>
+#include <linux/time.h>
+#include <asm/unistd.h>
+
+/* 0 = no checking
+ 1 = put_count checking
+ 2 = verbose put_count checking
+*/
+#define AUDIT_DEBUG 0
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+extern int audit_enabled;
+
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). */
+#define AUDIT_NAMES 20
+
+/* AUDIT_NAMES_RESERVED is the number of slots we reserve in the
+ * audit_context from being used for nameless inodes from
+ * path_lookup. */
+#define AUDIT_NAMES_RESERVED 7
+
+/* At task start time, the audit_state is set in the audit_context using
+ a per-task filter. At syscall entry, the audit_state is augmented by
+ the syscall filter. */
+enum audit_state {
+ AUDIT_DISABLED, /* Do not create per-task audit_context.
+ * No syscall-specific audit records can
+ * be generated. */
+ AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
+ * but don't necessarily fill it in at
+ * syscall entry time (i.e., filter
+ * instead). */
+ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
+ * and always fill it in at syscall
+ * entry time. This makes a full
+ * syscall record available if some
+ * other part of the kernel decides it
+ * should be recorded. */
+ AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
+ * always fill it in at syscall entry
+ * time, and always write out the audit
+ * record at syscall exit time. */
+};
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and
+ * we don't let putname() free it (instead we free all of the saved
+ * pointers at syscall exit time).
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device. */
+struct audit_names {
+ const char *name;
+ unsigned long ino;
+ dev_t dev;
+ umode_t mode;
+ uid_t uid;
+ gid_t gid;
+ dev_t rdev;
+};
+
+struct audit_aux_data {
+ struct audit_aux_data *next;
+ int type;
+};
+
+#define AUDIT_AUX_IPCPERM 0
+
+struct audit_aux_data_ipcctl {
+ struct audit_aux_data d;
+ struct ipc_perm p;
+ unsigned long qbytes;
+ uid_t uid;
+ gid_t gid;
+ mode_t mode;
+};
+
+
+/* The per-task audit context. */
+struct audit_context {
+ int in_syscall; /* 1 if task is in a syscall */
+ enum audit_state state;
+ unsigned int serial; /* serial number for record */
+ struct timespec ctime; /* time of syscall entry */
+ uid_t loginuid; /* login uid (identity) */
+ int major; /* syscall number */
+ unsigned long argv[4]; /* syscall arguments */
+ int return_valid; /* return code is valid */
+ int return_code;/* syscall return code */
+ int auditable; /* 1 if record should be written */
+ int name_count;
+ struct audit_names names[AUDIT_NAMES];
+ struct audit_context *previous; /* For nested syscalls */
+ struct audit_aux_data *aux;
+
+ /* Save things to print about task_struct */
+ pid_t pid;
+ uid_t uid, euid, suid, fsuid;
+ gid_t gid, egid, sgid, fsgid;
+ unsigned long personality;
+
+#if AUDIT_DEBUG
+ int put_count;
+ int ino_count;
+#endif
+};
+
+ /* Public API */
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+static LIST_HEAD(audit_tsklist);
+static LIST_HEAD(audit_entlist);
+static LIST_HEAD(audit_extlist);
+
+struct audit_entry {
+ struct list_head list;
+ struct rcu_head rcu;
+ struct audit_rule rule;
+};
+
+/* Check to see if two rules are identical. It is called from
+ * audit_del_rule during AUDIT_DEL. */
+static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+{
+ int i;
+
+ if (a->flags != b->flags)
+ return 1;
+
+ if (a->action != b->action)
+ return 1;
+
+ if (a->field_count != b->field_count)
+ return 1;
+
+ for (i = 0; i < a->field_count; i++) {
+ if (a->fields[i] != b->fields[i]
+ || a->values[i] != b->values[i])
+ return 1;
+ }
+
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ if (a->mask[i] != b->mask[i])
+ return 1;
+
+ return 0;
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_add_rule(struct audit_entry *entry,
+ struct list_head *list)
+{
+ if (entry->rule.flags & AUDIT_PREPEND) {
+ entry->rule.flags &= ~AUDIT_PREPEND;
+ list_add_rcu(&entry->list, list);
+ } else {
+ list_add_tail_rcu(&entry->list, list);
+ }
+ return 0;
+}
+
+static void audit_free_rule(struct rcu_head *head)
+{
+ struct audit_entry *e = container_of(head, struct audit_entry, rcu);
+ kfree(e);
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_del_rule(struct audit_rule *rule,
+ struct list_head *list)
+{
+ struct audit_entry *e;
+
+ /* Do not use the _rcu iterator here, since this is the only
+ * deletion routine. */
+ list_for_each_entry(e, list, list) {
+ if (!audit_compare_rule(rule, &e->rule)) {
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu, audit_free_rule);
+ return 0;
+ }
+ }
+ return -EFAULT; /* No matching rule */
+}
+
+#ifdef CONFIG_NET
+/* Copy rule from user-space to kernel-space. Called during
+ * AUDIT_ADD. */
+static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+{
+ int i;
+
+ if (s->action != AUDIT_NEVER
+ && s->action != AUDIT_POSSIBLE
+ && s->action != AUDIT_ALWAYS)
+ return -1;
+ if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
+ return -1;
+
+ d->flags = s->flags;
+ d->action = s->action;
+ d->field_count = s->field_count;
+ for (i = 0; i < d->field_count; i++) {
+ d->fields[i] = s->fields[i];
+ d->values[i] = s->values[i];
+ }
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+ return 0;
+}
+
+int audit_receive_filter(int type, int pid, int uid, int seq, void *data)
+{
+ u32 flags;
+ struct audit_entry *entry;
+ int err = 0;
+
+ switch (type) {
+ case AUDIT_LIST:
+ /* The *_rcu iterators not needed here because we are
+ always called with audit_netlink_sem held. */
+ list_for_each_entry(entry, &audit_tsklist, list)
+ audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+ &entry->rule, sizeof(entry->rule));
+ list_for_each_entry(entry, &audit_entlist, list)
+ audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+ &entry->rule, sizeof(entry->rule));
+ list_for_each_entry(entry, &audit_extlist, list)
+ audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+ &entry->rule, sizeof(entry->rule));
+ audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+ break;
+ case AUDIT_ADD:
+ if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
+ return -ENOMEM;
+ if (audit_copy_rule(&entry->rule, data)) {
+ kfree(entry);
+ return -EINVAL;
+ }
+ flags = entry->rule.flags;
+ if (!err && (flags & AUDIT_PER_TASK))
+ err = audit_add_rule(entry, &audit_tsklist);
+ if (!err && (flags & AUDIT_AT_ENTRY))
+ err = audit_add_rule(entry, &audit_entlist);
+ if (!err && (flags & AUDIT_AT_EXIT))
+ err = audit_add_rule(entry, &audit_extlist);
+ break;
+ case AUDIT_DEL:
+ flags =((struct audit_rule *)data)->flags;
+ if (!err && (flags & AUDIT_PER_TASK))
+ err = audit_del_rule(data, &audit_tsklist);
+ if (!err && (flags & AUDIT_AT_ENTRY))
+ err = audit_del_rule(data, &audit_entlist);
+ if (!err && (flags & AUDIT_AT_EXIT))
+ err = audit_del_rule(data, &audit_extlist);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return err;
+}
+#endif
+
+/* Compare a task_struct with an audit_rule. Return 1 on match, 0
+ * otherwise. */
+static int audit_filter_rules(struct task_struct *tsk,
+ struct audit_rule *rule,
+ struct audit_context *ctx,
+ enum audit_state *state)
+{
+ int i, j;
+
+ for (i = 0; i < rule->field_count; i++) {
+ u32 field = rule->fields[i] & ~AUDIT_NEGATE;
+ u32 value = rule->values[i];
+ int result = 0;
+
+ switch (field) {
+ case AUDIT_PID:
+ result = (tsk->pid == value);
+ break;
+ case AUDIT_UID:
+ result = (tsk->uid == value);
+ break;
+ case AUDIT_EUID:
+ result = (tsk->euid == value);
+ break;
+ case AUDIT_SUID:
+ result = (tsk->suid == value);
+ break;
+ case AUDIT_FSUID:
+ result = (tsk->fsuid == value);
+ break;
+ case AUDIT_GID:
+ result = (tsk->gid == value);
+ break;
+ case AUDIT_EGID:
+ result = (tsk->egid == value);
+ break;
+ case AUDIT_SGID:
+ result = (tsk->sgid == value);
+ break;
+ case AUDIT_FSGID:
+ result = (tsk->fsgid == value);
+ break;
+ case AUDIT_PERS:
+ result = (tsk->personality == value);
+ break;
+
+ case AUDIT_EXIT:
+ if (ctx && ctx->return_valid)
+ result = (ctx->return_code == value);
+ break;
+ case AUDIT_SUCCESS:
+ if (ctx && ctx->return_valid)
+ result = (ctx->return_code >= 0);
+ break;
+ case AUDIT_DEVMAJOR:
+ if (ctx) {
+ for (j = 0; j < ctx->name_count; j++) {
+ if (MAJOR(ctx->names[j].dev)==value) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_DEVMINOR:
+ if (ctx) {
+ for (j = 0; j < ctx->name_count; j++) {
+ if (MINOR(ctx->names[j].dev)==value) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_INODE:
+ if (ctx) {
+ for (j = 0; j < ctx->name_count; j++) {
+ if (ctx->names[j].ino == value) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_LOGINUID:
+ result = 0;
+ if (ctx)
+ result = (ctx->loginuid == value);
+ break;
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ if (ctx)
+ result = (ctx->argv[field-AUDIT_ARG0]==value);
+ break;
+ }
+
+ if (rule->fields[i] & AUDIT_NEGATE)
+ result = !result;
+ if (!result)
+ return 0;
+ }
+ switch (rule->action) {
+ case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
+ case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
+ case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ }
+ return 1;
+}
+
+/* At process creation time, we can determine if system-call auditing is
+ * completely disabled for this task. Since we only have the task
+ * structure at this point, we can only check uid and gid.
+ */
+static enum audit_state audit_filter_task(struct task_struct *tsk)
+{
+ struct audit_entry *e;
+ enum audit_state state;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &audit_tsklist, list) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+/* At syscall entry and exit time, this filter is called if the
+ * audit_state is not low enough that auditing cannot take place, but is
+ * also not high enough that we already know we have to write and audit
+ * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+ */
+static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+ struct audit_context *ctx,
+ struct list_head *list)
+{
+ struct audit_entry *e;
+ enum audit_state state;
+ int word = AUDIT_WORD(ctx->major);
+ int bit = AUDIT_BIT(ctx->major);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, list, list) {
+ if ((e->rule.mask[word] & bit) == bit
+ && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+/* This should be called with task_lock() held. */
+static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+ int return_valid,
+ int return_code)
+{
+ struct audit_context *context = tsk->audit_context;
+
+ if (likely(!context))
+ return NULL;
+ context->return_valid = return_valid;
+ context->return_code = return_code;
+
+ if (context->in_syscall && !context->auditable) {
+ enum audit_state state;
+ state = audit_filter_syscall(tsk, context, &audit_extlist);
+ if (state == AUDIT_RECORD_CONTEXT)
+ context->auditable = 1;
+ }
+
+ context->pid = tsk->pid;
+ context->uid = tsk->uid;
+ context->gid = tsk->gid;
+ context->euid = tsk->euid;
+ context->suid = tsk->suid;
+ context->fsuid = tsk->fsuid;
+ context->egid = tsk->egid;
+ context->sgid = tsk->sgid;
+ context->fsgid = tsk->fsgid;
+ context->personality = tsk->personality;
+ tsk->audit_context = NULL;
+ return context;
+}
+
+static inline void audit_free_names(struct audit_context *context)
+{
+ int i;
+
+#if AUDIT_DEBUG == 2
+ if (context->auditable
+ ||context->put_count + context->ino_count != context->name_count) {
+ printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d"
+ " name_count=%d put_count=%d"
+ " ino_count=%d [NOT freeing]\n",
+ __LINE__,
+ context->serial, context->major, context->in_syscall,
+ context->name_count, context->put_count,
+ context->ino_count);
+ for (i = 0; i < context->name_count; i++)
+ printk(KERN_ERR "names[%d] = %p = %s\n", i,
+ context->names[i].name,
+ context->names[i].name);
+ dump_stack();
+ return;
+ }
+#endif
+#if AUDIT_DEBUG
+ context->put_count = 0;
+ context->ino_count = 0;
+#endif
+
+ for (i = 0; i < context->name_count; i++)
+ if (context->names[i].name)
+ __putname(context->names[i].name);
+ context->name_count = 0;
+}
+
+static inline void audit_free_aux(struct audit_context *context)
+{
+ struct audit_aux_data *aux;
+
+ while ((aux = context->aux)) {
+ context->aux = aux->next;
+ kfree(aux);
+ }
+}
+
+static inline void audit_zero_context(struct audit_context *context,
+ enum audit_state state)
+{
+ uid_t loginuid = context->loginuid;
+
+ memset(context, 0, sizeof(*context));
+ context->state = state;
+ context->loginuid = loginuid;
+}
+
+static inline struct audit_context *audit_alloc_context(enum audit_state state)
+{
+ struct audit_context *context;
+
+ if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+ return NULL;
+ audit_zero_context(context, state);
+ return context;
+}
+
+/* Filter on the task information and allocate a per-task audit context
+ * if necessary. Doing so turns on system call auditing for the
+ * specified task. This is called from copy_process, so no lock is
+ * needed. */
+int audit_alloc(struct task_struct *tsk)
+{
+ struct audit_context *context;
+ enum audit_state state;
+
+ if (likely(!audit_enabled))
+ return 0; /* Return if not auditing. */
+
+ state = audit_filter_task(tsk);
+ if (likely(state == AUDIT_DISABLED))
+ return 0;
+
+ if (!(context = audit_alloc_context(state))) {
+ audit_log_lost("out of memory in audit_alloc");
+ return -ENOMEM;
+ }
+
+ /* Preserve login uid */
+ context->loginuid = -1;
+ if (current->audit_context)
+ context->loginuid = current->audit_context->loginuid;
+
+ tsk->audit_context = context;
+ set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ return 0;
+}
+
+static inline void audit_free_context(struct audit_context *context)
+{
+ struct audit_context *previous;
+ int count = 0;
+
+ do {
+ previous = context->previous;
+ if (previous || (count && count < 10)) {
+ ++count;
+ printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
+ " freeing multiple contexts (%d)\n",
+ context->serial, context->major,
+ context->name_count, count);
+ }
+ audit_free_names(context);
+ audit_free_aux(context);
+ kfree(context);
+ context = previous;
+ } while (context);
+ if (count >= 10)
+ printk(KERN_ERR "audit: freed %d contexts\n", count);
+}
+
+static void audit_log_exit(struct audit_context *context)
+{
+ int i;
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(context);
+ if (!ab)
+ return; /* audit_panic has been called */
+ audit_log_format(ab, "syscall=%d", context->major);
+ if (context->personality != PER_LINUX)
+ audit_log_format(ab, " per=%lx", context->personality);
+ if (context->return_valid)
+ audit_log_format(ab, " exit=%d", context->return_code);
+ audit_log_format(ab,
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
+ " pid=%d loginuid=%d uid=%d gid=%d"
+ " euid=%d suid=%d fsuid=%d"
+ " egid=%d sgid=%d fsgid=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count,
+ context->pid,
+ context->loginuid,
+ context->uid,
+ context->gid,
+ context->euid, context->suid, context->fsuid,
+ context->egid, context->sgid, context->fsgid);
+ audit_log_end(ab);
+ while (context->aux) {
+ struct audit_aux_data *aux;
+
+ ab = audit_log_start(context);
+ if (!ab)
+ continue; /* audit_panic has been called */
+
+ aux = context->aux;
+ context->aux = aux->next;
+
+ audit_log_format(ab, "auxitem=%d", aux->type);
+ switch (aux->type) {
+ case AUDIT_AUX_IPCPERM: {
+ struct audit_aux_data_ipcctl *axi = (void *)aux;
+ audit_log_format(ab,
+ " qbytes=%lx uid=%d gid=%d mode=%x",
+ axi->qbytes, axi->uid, axi->gid, axi->mode);
+ }
+ }
+ audit_log_end(ab);
+ kfree(aux);
+ }
+
+ for (i = 0; i < context->name_count; i++) {
+ ab = audit_log_start(context);
+ if (!ab)
+ continue; /* audit_panic has been called */
+ audit_log_format(ab, "item=%d", i);
+ if (context->names[i].name)
+ audit_log_format(ab, " name=%s",
+ context->names[i].name);
+ if (context->names[i].ino != (unsigned long)-1)
+ audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
+ " uid=%d gid=%d rdev=%02x:%02x",
+ context->names[i].ino,
+ MAJOR(context->names[i].dev),
+ MINOR(context->names[i].dev),
+ context->names[i].mode,
+ context->names[i].uid,
+ context->names[i].gid,
+ MAJOR(context->names[i].rdev),
+ MINOR(context->names[i].rdev));
+ audit_log_end(ab);
+ }
+}
+
+/* Free a per-task audit context. Called from copy_process and
+ * __put_task_struct. */
+void audit_free(struct task_struct *tsk)
+{
+ struct audit_context *context;
+
+ task_lock(tsk);
+ context = audit_get_context(tsk, 0, 0);
+ task_unlock(tsk);
+
+ if (likely(!context))
+ return;
+
+ /* Check for system calls that do not go through the exit
+ * function (e.g., exit_group), then free context block. */
+ if (context->in_syscall && context->auditable)
+ audit_log_exit(context);
+
+ audit_free_context(context);
+}
+
+/* Compute a serial number for the audit record. Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces. The timestamp of the
+ * record and this serial number are used by the user-space daemon to
+ * determine which pieces belong to the same audit record. The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * Atomic values are only guaranteed to be 24-bit, so we count down.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit. However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts). */
+static inline unsigned int audit_serial(void)
+{
+ static atomic_t serial = ATOMIC_INIT(0xffffff);
+ unsigned int a, b;
+
+ do {
+ a = atomic_read(&serial);
+ if (atomic_dec_and_test(&serial))
+ atomic_set(&serial, 0xffffff);
+ b = atomic_read(&serial);
+ } while (b != a - 1);
+
+ return 0xffffff - b;
+}
+
+/* Fill in audit context at syscall entry. This only happens if the
+ * audit context was created when the task was created and the state or
+ * filters demand the audit context be built. If the state from the
+ * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
+ * then the record will be written at syscall exit time (otherwise, it
+ * will only be written if another part of the kernel requests that it
+ * be written). */
+void audit_syscall_entry(struct task_struct *tsk, int major,
+ unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4)
+{
+ struct audit_context *context = tsk->audit_context;
+ enum audit_state state;
+
+ BUG_ON(!context);
+
+ /* This happens only on certain architectures that make system
+ * calls in kernel_thread via the entry.S interface, instead of
+ * with direct calls. (If you are porting to a new
+ * architecture, hitting this condition can indicate that you
+ * got the _exit/_leave calls backward in entry.S.)
+ *
+ * i386 no
+ * x86_64 no
+ * ppc64 yes (see arch/ppc64/kernel/misc.S)
+ *
+ * This also happens with vm86 emulation in a non-nested manner
+ * (entries without exits), so this case must be caught.
+ */
+ if (context->in_syscall) {
+ struct audit_context *newctx;
+
+#if defined(__NR_vm86) && defined(__NR_vm86old)
+ /* vm86 mode should only be entered once */
+ if (major == __NR_vm86 || major == __NR_vm86old)
+ return;
+#endif
+#if AUDIT_DEBUG
+ printk(KERN_ERR
+ "audit(:%d) pid=%d in syscall=%d;"
+ " entering syscall=%d\n",
+ context->serial, tsk->pid, context->major, major);
+#endif
+ newctx = audit_alloc_context(context->state);
+ if (newctx) {
+ newctx->previous = context;
+ context = newctx;
+ tsk->audit_context = newctx;
+ } else {
+ /* If we can't alloc a new context, the best we
+ * can do is to leak memory (any pending putname
+ * will be lost). The only other alternative is
+ * to abandon auditing. */
+ audit_zero_context(context, context->state);
+ }
+ }
+ BUG_ON(context->in_syscall || context->name_count);
+
+ if (!audit_enabled)
+ return;
+
+ context->major = major;
+ context->argv[0] = a1;
+ context->argv[1] = a2;
+ context->argv[2] = a3;
+ context->argv[3] = a4;
+
+ state = context->state;
+ if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
+ state = audit_filter_syscall(tsk, context, &audit_entlist);
+ if (likely(state == AUDIT_DISABLED))
+ return;
+
+ context->serial = audit_serial();
+ context->ctime = CURRENT_TIME;
+ context->in_syscall = 1;
+ context->auditable = !!(state == AUDIT_RECORD_CONTEXT);
+}
+
+/* Tear down after system call. If the audit context has been marked as
+ * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * filtering, or because some other part of the kernel write an audit
+ * message), then write out the syscall information. In call cases,
+ * free the names stored from getname(). */
+void audit_syscall_exit(struct task_struct *tsk, int return_code)
+{
+ struct audit_context *context;
+
+ get_task_struct(tsk);
+ task_lock(tsk);
+ context = audit_get_context(tsk, 1, return_code);
+ task_unlock(tsk);
+
+ /* Not having a context here is ok, since the parent may have
+ * called __put_task_struct. */
+ if (likely(!context))
+ return;
+
+ if (context->in_syscall && context->auditable)
+ audit_log_exit(context);
+
+ context->in_syscall = 0;
+ context->auditable = 0;
+ if (context->previous) {
+ struct audit_context *new_context = context->previous;
+ context->previous = NULL;
+ audit_free_context(context);
+ tsk->audit_context = new_context;
+ } else {
+ audit_free_names(context);
+ audit_free_aux(context);
+ audit_zero_context(context, context->state);
+ tsk->audit_context = context;
+ }
+ put_task_struct(tsk);
+}
+
+/* Add a name to the list. Called from fs/namei.c:getname(). */
+void audit_getname(const char *name)
+{
+ struct audit_context *context = current->audit_context;
+
+ if (!context || IS_ERR(name) || !name)
+ return;
+
+ if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+ printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+ __FILE__, __LINE__, context->serial, name);
+ dump_stack();
+#endif
+ return;
+ }
+ BUG_ON(context->name_count >= AUDIT_NAMES);
+ context->names[context->name_count].name = name;
+ context->names[context->name_count].ino = (unsigned long)-1;
+ ++context->name_count;
+}
+
+/* Intercept a putname request. Called from
+ * include/linux/fs.h:putname(). If we have stored the name from
+ * getname in the audit context, then we delay the putname until syscall
+ * exit. */
+void audit_putname(const char *name)
+{
+ struct audit_context *context = current->audit_context;
+
+ BUG_ON(!context);
+ if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+ printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+ __FILE__, __LINE__, context->serial, name);
+ if (context->name_count) {
+ int i;
+ for (i = 0; i < context->name_count; i++)
+ printk(KERN_ERR "name[%d] = %p = %s\n", i,
+ context->names[i].name,
+ context->names[i].name);
+ }
+#endif
+ __putname(name);
+ }
+#if AUDIT_DEBUG
+ else {
+ ++context->put_count;
+ if (context->put_count > context->name_count) {
+ printk(KERN_ERR "%s:%d(:%d): major=%d"
+ " in_syscall=%d putname(%p) name_count=%d"
+ " put_count=%d\n",
+ __FILE__, __LINE__,
+ context->serial, context->major,
+ context->in_syscall, name, context->name_count,
+ context->put_count);
+ dump_stack();
+ }
+ }
+#endif
+}
+
+/* Store the inode and device from a lookup. Called from
+ * fs/namei.c:path_lookup(). */
+void audit_inode(const char *name, const struct inode *inode)
+{
+ int idx;
+ struct audit_context *context = current->audit_context;
+
+ if (!context->in_syscall)
+ return;
+ if (context->name_count
+ && context->names[context->name_count-1].name
+ && context->names[context->name_count-1].name == name)
+ idx = context->name_count - 1;
+ else if (context->name_count > 1
+ && context->names[context->name_count-2].name
+ && context->names[context->name_count-2].name == name)
+ idx = context->name_count - 2;
+ else {
+ /* FIXME: how much do we care about inodes that have no
+ * associated name? */
+ if (context->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED)
+ return;
+ idx = context->name_count++;
+ context->names[idx].name = NULL;
+#if AUDIT_DEBUG
+ ++context->ino_count;
+#endif
+ }
+ context->names[idx].ino = inode->i_ino;
+ context->names[idx].dev = inode->i_sb->s_dev;
+ context->names[idx].mode = inode->i_mode;
+ context->names[idx].uid = inode->i_uid;
+ context->names[idx].gid = inode->i_gid;
+ context->names[idx].rdev = inode->i_rdev;
+}
+
+void audit_get_stamp(struct audit_context *ctx,
+ struct timespec *t, int *serial)
+{
+ if (ctx) {
+ t->tv_sec = ctx->ctime.tv_sec;
+ t->tv_nsec = ctx->ctime.tv_nsec;
+ *serial = ctx->serial;
+ ctx->auditable = 1;
+ } else {
+ *t = CURRENT_TIME;
+ *serial = 0;
+ }
+}
+
+extern int audit_set_type(struct audit_buffer *ab, int type);
+
+int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid)
+{
+ if (ctx) {
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL);
+ if (ab) {
+ audit_log_format(ab, "login pid=%d uid=%u "
+ "old loginuid=%u new loginuid=%u",
+ ctx->pid, ctx->uid, ctx->loginuid, loginuid);
+ audit_set_type(ab, AUDIT_LOGIN);
+ audit_log_end(ab);
+ }
+ ctx->loginuid = loginuid;
+ }
+ return 0;
+}
+
+uid_t audit_get_loginuid(struct audit_context *ctx)
+{
+ return ctx ? ctx->loginuid : -1;
+}
+
+int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+{
+ struct audit_aux_data_ipcctl *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->qbytes = qbytes;
+ ax->uid = uid;
+ ax->gid = gid;
+ ax->mode = mode;
+
+ ax->d.type = AUDIT_AUX_IPCPERM;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
diff --git a/kernel/capability.c b/kernel/capability.c
new file mode 100644
index 000000000000..64db1ee820c2
--- /dev/null
+++ b/kernel/capability.c
@@ -0,0 +1,220 @@
+/*
+ * linux/kernel/capability.c
+ *
+ * Copyright (C) 1997 Andrew Main <zefram@fysh.org>
+ *
+ * Integrated into 2.1.97+, Andrew G. Morgan <morgan@transmeta.com>
+ * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <asm/uaccess.h>
+
+unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
+kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
+
+EXPORT_SYMBOL(securebits);
+EXPORT_SYMBOL(cap_bset);
+
+/*
+ * This lock protects task->cap_* for all tasks including current.
+ * Locking rule: acquire this prior to tasklist_lock.
+ */
+static DEFINE_SPINLOCK(task_capability_lock);
+
+/*
+ * For sys_getproccap() and sys_setproccap(), any of the three
+ * capability set pointers may be NULL -- indicating that that set is
+ * uninteresting and/or not to be changed.
+ */
+
+/*
+ * sys_capget - get the capabilities of a given process.
+ */
+asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
+{
+ int ret = 0;
+ pid_t pid;
+ __u32 version;
+ task_t *target;
+ struct __user_cap_data_struct data;
+
+ if (get_user(version, &header->version))
+ return -EFAULT;
+
+ if (version != _LINUX_CAPABILITY_VERSION) {
+ if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
+ return -EFAULT;
+ return -EINVAL;
+ }
+
+ if (get_user(pid, &header->pid))
+ return -EFAULT;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ spin_lock(&task_capability_lock);
+ read_lock(&tasklist_lock);
+
+ if (pid && pid != current->pid) {
+ target = find_task_by_pid(pid);
+ if (!target) {
+ ret = -ESRCH;
+ goto out;
+ }
+ } else
+ target = current;
+
+ ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted);
+
+out:
+ read_unlock(&tasklist_lock);
+ spin_unlock(&task_capability_lock);
+
+ if (!ret && copy_to_user(dataptr, &data, sizeof data))
+ return -EFAULT;
+
+ return ret;
+}
+
+/*
+ * cap_set_pg - set capabilities for all processes in a given process
+ * group. We call this holding task_capability_lock and tasklist_lock.
+ */
+static inline int cap_set_pg(int pgrp, kernel_cap_t *effective,
+ kernel_cap_t *inheritable,
+ kernel_cap_t *permitted)
+{
+ task_t *g, *target;
+ int ret = -EPERM;
+ int found = 0;
+
+ do_each_task_pid(pgrp, PIDTYPE_PGID, g) {
+ target = g;
+ while_each_thread(g, target) {
+ if (!security_capset_check(target, effective,
+ inheritable,
+ permitted)) {
+ security_capset_set(target, effective,
+ inheritable,
+ permitted);
+ ret = 0;
+ }
+ found = 1;
+ }
+ } while_each_task_pid(pgrp, PIDTYPE_PGID, g);
+
+ if (!found)
+ ret = 0;
+ return ret;
+}
+
+/*
+ * cap_set_all - set capabilities for all processes other than init
+ * and self. We call this holding task_capability_lock and tasklist_lock.
+ */
+static inline int cap_set_all(kernel_cap_t *effective,
+ kernel_cap_t *inheritable,
+ kernel_cap_t *permitted)
+{
+ task_t *g, *target;
+ int ret = -EPERM;
+ int found = 0;
+
+ do_each_thread(g, target) {
+ if (target == current || target->pid == 1)
+ continue;
+ found = 1;
+ if (security_capset_check(target, effective, inheritable,
+ permitted))
+ continue;
+ ret = 0;
+ security_capset_set(target, effective, inheritable, permitted);
+ } while_each_thread(g, target);
+
+ if (!found)
+ ret = 0;
+ return ret;
+}
+
+/*
+ * sys_capset - set capabilities for a given process, all processes, or all
+ * processes in a given process group.
+ *
+ * The restrictions on setting capabilities are specified as:
+ *
+ * [pid is for the 'target' task. 'current' is the calling task.]
+ *
+ * I: any raised capabilities must be a subset of the (old current) permitted
+ * P: any raised capabilities must be a subset of the (old current) permitted
+ * E: must be set to a subset of (new target) permitted
+ */
+asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
+{
+ kernel_cap_t inheritable, permitted, effective;
+ __u32 version;
+ task_t *target;
+ int ret;
+ pid_t pid;
+
+ if (get_user(version, &header->version))
+ return -EFAULT;
+
+ if (version != _LINUX_CAPABILITY_VERSION) {
+ if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
+ return -EFAULT;
+ return -EINVAL;
+ }
+
+ if (get_user(pid, &header->pid))
+ return -EFAULT;
+
+ if (pid && pid != current->pid && !capable(CAP_SETPCAP))
+ return -EPERM;
+
+ if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
+ copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) ||
+ copy_from_user(&permitted, &data->permitted, sizeof(permitted)))
+ return -EFAULT;
+
+ spin_lock(&task_capability_lock);
+ read_lock(&tasklist_lock);
+
+ if (pid > 0 && pid != current->pid) {
+ target = find_task_by_pid(pid);
+ if (!target) {
+ ret = -ESRCH;
+ goto out;
+ }
+ } else
+ target = current;
+
+ ret = 0;
+
+ /* having verified that the proposed changes are legal,
+ we now put them into effect. */
+ if (pid < 0) {
+ if (pid == -1) /* all procs other than current and init */
+ ret = cap_set_all(&effective, &inheritable, &permitted);
+
+ else /* all procs in process group */
+ ret = cap_set_pg(-pid, &effective, &inheritable,
+ &permitted);
+ } else {
+ ret = security_capset_check(target, &effective, &inheritable,
+ &permitted);
+ if (!ret)
+ security_capset_set(target, &effective, &inheritable,
+ &permitted);
+ }
+
+out:
+ read_unlock(&tasklist_lock);
+ spin_unlock(&task_capability_lock);
+
+ return ret;
+}
diff --git a/kernel/compat.c b/kernel/compat.c
new file mode 100644
index 000000000000..dad10656bf14
--- /dev/null
+++ b/kernel/compat.c
@@ -0,0 +1,860 @@
+/*
+ * linux/kernel/compat.c
+ *
+ * Kernel compatibililty routines for e.g. 32 bit syscall support
+ * on 64 bit kernels.
+ *
+ * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/signal.h>
+#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */
+#include <linux/futex.h> /* for FUTEX_WAIT */
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/security.h>
+
+#include <asm/uaccess.h>
+#include <asm/bug.h>
+
+int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
+{
+ return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
+ __get_user(ts->tv_sec, &cts->tv_sec) ||
+ __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
+}
+
+int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
+{
+ return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
+ __put_user(ts->tv_sec, &cts->tv_sec) ||
+ __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
+}
+
+static long compat_nanosleep_restart(struct restart_block *restart)
+{
+ unsigned long expire = restart->arg0, now = jiffies;
+ struct compat_timespec __user *rmtp;
+
+ /* Did it expire while we handled signals? */
+ if (!time_after(expire, now))
+ return 0;
+
+ current->state = TASK_INTERRUPTIBLE;
+ expire = schedule_timeout(expire - now);
+ if (expire == 0)
+ return 0;
+
+ rmtp = (struct compat_timespec __user *)restart->arg1;
+ if (rmtp) {
+ struct compat_timespec ct;
+ struct timespec t;
+
+ jiffies_to_timespec(expire, &t);
+ ct.tv_sec = t.tv_sec;
+ ct.tv_nsec = t.tv_nsec;
+ if (copy_to_user(rmtp, &ct, sizeof(ct)))
+ return -EFAULT;
+ }
+ /* The 'restart' block is already filled in */
+ return -ERESTART_RESTARTBLOCK;
+}
+
+asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
+ struct compat_timespec __user *rmtp)
+{
+ struct timespec t;
+ struct restart_block *restart;
+ unsigned long expire;
+
+ if (get_compat_timespec(&t, rqtp))
+ return -EFAULT;
+
+ if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
+ return -EINVAL;
+
+ expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
+ current->state = TASK_INTERRUPTIBLE;
+ expire = schedule_timeout(expire);
+ if (expire == 0)
+ return 0;
+
+ if (rmtp) {
+ jiffies_to_timespec(expire, &t);
+ if (put_compat_timespec(&t, rmtp))
+ return -EFAULT;
+ }
+ restart = &current_thread_info()->restart_block;
+ restart->fn = compat_nanosleep_restart;
+ restart->arg0 = jiffies + expire;
+ restart->arg1 = (unsigned long) rmtp;
+ return -ERESTART_RESTARTBLOCK;
+}
+
+static inline long get_compat_itimerval(struct itimerval *o,
+ struct compat_itimerval __user *i)
+{
+ return (!access_ok(VERIFY_READ, i, sizeof(*i)) ||
+ (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) |
+ __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) |
+ __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) |
+ __get_user(o->it_value.tv_usec, &i->it_value.tv_usec)));
+}
+
+static inline long put_compat_itimerval(struct compat_itimerval __user *o,
+ struct itimerval *i)
+{
+ return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) ||
+ (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) |
+ __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) |
+ __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) |
+ __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
+}
+
+asmlinkage long compat_sys_getitimer(int which,
+ struct compat_itimerval __user *it)
+{
+ struct itimerval kit;
+ int error;
+
+ error = do_getitimer(which, &kit);
+ if (!error && put_compat_itimerval(it, &kit))
+ error = -EFAULT;
+ return error;
+}
+
+asmlinkage long compat_sys_setitimer(int which,
+ struct compat_itimerval __user *in,
+ struct compat_itimerval __user *out)
+{
+ struct itimerval kin, kout;
+ int error;
+
+ if (in) {
+ if (get_compat_itimerval(&kin, in))
+ return -EFAULT;
+ } else
+ memset(&kin, 0, sizeof(kin));
+
+ error = do_setitimer(which, &kin, out ? &kout : NULL);
+ if (error || !out)
+ return error;
+ if (put_compat_itimerval(out, &kout))
+ return -EFAULT;
+ return 0;
+}
+
+asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
+{
+ /*
+ * In the SMP world we might just be unlucky and have one of
+ * the times increment as we use it. Since the value is an
+ * atomically safe type this is just fine. Conceptually its
+ * as if the syscall took an instant longer to occur.
+ */
+ if (tbuf) {
+ struct compat_tms tmp;
+ struct task_struct *tsk = current;
+ struct task_struct *t;
+ cputime_t utime, stime, cutime, cstime;
+
+ read_lock(&tasklist_lock);
+ utime = tsk->signal->utime;
+ stime = tsk->signal->stime;
+ t = tsk;
+ do {
+ utime = cputime_add(utime, t->utime);
+ stime = cputime_add(stime, t->stime);
+ t = next_thread(t);
+ } while (t != tsk);
+
+ /*
+ * While we have tasklist_lock read-locked, no dying thread
+ * can be updating current->signal->[us]time. Instead,
+ * we got their counts included in the live thread loop.
+ * However, another thread can come in right now and
+ * do a wait call that updates current->signal->c[us]time.
+ * To make sure we always see that pair updated atomically,
+ * we take the siglock around fetching them.
+ */
+ spin_lock_irq(&tsk->sighand->siglock);
+ cutime = tsk->signal->cutime;
+ cstime = tsk->signal->cstime;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ read_unlock(&tasklist_lock);
+
+ tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
+ tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
+ tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
+ tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
+ if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
+ return -EFAULT;
+ }
+ return compat_jiffies_to_clock_t(jiffies);
+}
+
+/*
+ * Assumption: old_sigset_t and compat_old_sigset_t are both
+ * types that can be passed to put_user()/get_user().
+ */
+
+asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
+{
+ old_sigset_t s;
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret = sys_sigpending((old_sigset_t __user *) &s);
+ set_fs(old_fs);
+ if (ret == 0)
+ ret = put_user(s, set);
+ return ret;
+}
+
+asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
+ compat_old_sigset_t __user *oset)
+{
+ old_sigset_t s;
+ long ret;
+ mm_segment_t old_fs;
+
+ if (set && get_user(s, set))
+ return -EFAULT;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = sys_sigprocmask(how,
+ set ? (old_sigset_t __user *) &s : NULL,
+ oset ? (old_sigset_t __user *) &s : NULL);
+ set_fs(old_fs);
+ if (ret == 0)
+ if (oset)
+ ret = put_user(s, oset);
+ return ret;
+}
+
+#ifdef CONFIG_FUTEX
+asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
+ struct compat_timespec __user *utime, u32 __user *uaddr2,
+ int val3)
+{
+ struct timespec t;
+ unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+ int val2 = 0;
+
+ if ((op == FUTEX_WAIT) && utime) {
+ if (get_compat_timespec(&t, utime))
+ return -EFAULT;
+ timeout = timespec_to_jiffies(&t) + 1;
+ }
+ if (op >= FUTEX_REQUEUE)
+ val2 = (int) (unsigned long) utime;
+
+ return do_futex((unsigned long)uaddr, op, val, timeout,
+ (unsigned long)uaddr2, val2, val3);
+}
+#endif
+
+asmlinkage long compat_sys_setrlimit(unsigned int resource,
+ struct compat_rlimit __user *rlim)
+{
+ struct rlimit r;
+ int ret;
+ mm_segment_t old_fs = get_fs ();
+
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+
+ if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
+ __get_user(r.rlim_cur, &rlim->rlim_cur) ||
+ __get_user(r.rlim_max, &rlim->rlim_max))
+ return -EFAULT;
+
+ if (r.rlim_cur == COMPAT_RLIM_INFINITY)
+ r.rlim_cur = RLIM_INFINITY;
+ if (r.rlim_max == COMPAT_RLIM_INFINITY)
+ r.rlim_max = RLIM_INFINITY;
+ set_fs(KERNEL_DS);
+ ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
+ set_fs(old_fs);
+ return ret;
+}
+
+#ifdef COMPAT_RLIM_OLD_INFINITY
+
+asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
+ struct compat_rlimit __user *rlim)
+{
+ struct rlimit r;
+ int ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret = sys_old_getrlimit(resource, &r);
+ set_fs(old_fs);
+
+ if (!ret) {
+ if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
+ r.rlim_cur = COMPAT_RLIM_INFINITY;
+ if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
+ r.rlim_max = COMPAT_RLIM_INFINITY;
+
+ if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
+ __put_user(r.rlim_cur, &rlim->rlim_cur) ||
+ __put_user(r.rlim_max, &rlim->rlim_max))
+ return -EFAULT;
+ }
+ return ret;
+}
+
+#endif
+
+asmlinkage long compat_sys_getrlimit (unsigned int resource,
+ struct compat_rlimit __user *rlim)
+{
+ struct rlimit r;
+ int ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
+ set_fs(old_fs);
+ if (!ret) {
+ if (r.rlim_cur > COMPAT_RLIM_INFINITY)
+ r.rlim_cur = COMPAT_RLIM_INFINITY;
+ if (r.rlim_max > COMPAT_RLIM_INFINITY)
+ r.rlim_max = COMPAT_RLIM_INFINITY;
+
+ if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
+ __put_user(r.rlim_cur, &rlim->rlim_cur) ||
+ __put_user(r.rlim_max, &rlim->rlim_max))
+ return -EFAULT;
+ }
+ return ret;
+}
+
+int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
+{
+ if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) ||
+ __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) ||
+ __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) ||
+ __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) ||
+ __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) ||
+ __put_user(r->ru_maxrss, &ru->ru_maxrss) ||
+ __put_user(r->ru_ixrss, &ru->ru_ixrss) ||
+ __put_user(r->ru_idrss, &ru->ru_idrss) ||
+ __put_user(r->ru_isrss, &ru->ru_isrss) ||
+ __put_user(r->ru_minflt, &ru->ru_minflt) ||
+ __put_user(r->ru_majflt, &ru->ru_majflt) ||
+ __put_user(r->ru_nswap, &ru->ru_nswap) ||
+ __put_user(r->ru_inblock, &ru->ru_inblock) ||
+ __put_user(r->ru_oublock, &ru->ru_oublock) ||
+ __put_user(r->ru_msgsnd, &ru->ru_msgsnd) ||
+ __put_user(r->ru_msgrcv, &ru->ru_msgrcv) ||
+ __put_user(r->ru_nsignals, &ru->ru_nsignals) ||
+ __put_user(r->ru_nvcsw, &ru->ru_nvcsw) ||
+ __put_user(r->ru_nivcsw, &ru->ru_nivcsw))
+ return -EFAULT;
+ return 0;
+}
+
+asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
+{
+ struct rusage r;
+ int ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret = sys_getrusage(who, (struct rusage __user *) &r);
+ set_fs(old_fs);
+
+ if (ret)
+ return ret;
+
+ if (put_compat_rusage(&r, ru))
+ return -EFAULT;
+
+ return 0;
+}
+
+asmlinkage long
+compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
+ struct compat_rusage __user *ru)
+{
+ if (!ru) {
+ return sys_wait4(pid, stat_addr, options, NULL);
+ } else {
+ struct rusage r;
+ int ret;
+ unsigned int status;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs (KERNEL_DS);
+ ret = sys_wait4(pid,
+ (stat_addr ?
+ (unsigned int __user *) &status : NULL),
+ options, (struct rusage __user *) &r);
+ set_fs (old_fs);
+
+ if (ret > 0) {
+ if (put_compat_rusage(&r, ru))
+ return -EFAULT;
+ if (stat_addr && put_user(status, stat_addr))
+ return -EFAULT;
+ }
+ return ret;
+ }
+}
+
+asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
+ struct compat_siginfo __user *uinfo, int options,
+ struct compat_rusage __user *uru)
+{
+ siginfo_t info;
+ struct rusage ru;
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ memset(&info, 0, sizeof(info));
+
+ set_fs(KERNEL_DS);
+ ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
+ uru ? (struct rusage __user *)&ru : NULL);
+ set_fs(old_fs);
+
+ if ((ret < 0) || (info.si_signo == 0))
+ return ret;
+
+ if (uru) {
+ ret = put_compat_rusage(&ru, uru);
+ if (ret)
+ return ret;
+ }
+
+ BUG_ON(info.si_code & __SI_MASK);
+ info.si_code |= __SI_CHLD;
+ return copy_siginfo_to_user32(uinfo, &info);
+}
+
+static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
+ unsigned len, cpumask_t *new_mask)
+{
+ unsigned long *k;
+
+ if (len < sizeof(cpumask_t))
+ memset(new_mask, 0, sizeof(cpumask_t));
+ else if (len > sizeof(cpumask_t))
+ len = sizeof(cpumask_t);
+
+ k = cpus_addr(*new_mask);
+ return compat_get_bitmap(k, user_mask_ptr, len * 8);
+}
+
+asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
+ unsigned int len,
+ compat_ulong_t __user *user_mask_ptr)
+{
+ cpumask_t new_mask;
+ int retval;
+
+ retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask);
+ if (retval)
+ return retval;
+
+ return sched_setaffinity(pid, new_mask);
+}
+
+asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
+ compat_ulong_t __user *user_mask_ptr)
+{
+ int ret;
+ cpumask_t mask;
+ unsigned long *k;
+ unsigned int min_length = sizeof(cpumask_t);
+
+ if (NR_CPUS <= BITS_PER_COMPAT_LONG)
+ min_length = sizeof(compat_ulong_t);
+
+ if (len < min_length)
+ return -EINVAL;
+
+ ret = sched_getaffinity(pid, &mask);
+ if (ret < 0)
+ return ret;
+
+ k = cpus_addr(mask);
+ ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
+ if (ret)
+ return ret;
+
+ return min_length;
+}
+
+static int get_compat_itimerspec(struct itimerspec *dst,
+ struct compat_itimerspec __user *src)
+{
+ if (get_compat_timespec(&dst->it_interval, &src->it_interval) ||
+ get_compat_timespec(&dst->it_value, &src->it_value))
+ return -EFAULT;
+ return 0;
+}
+
+static int put_compat_itimerspec(struct compat_itimerspec __user *dst,
+ struct itimerspec *src)
+{
+ if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
+ put_compat_timespec(&src->it_value, &dst->it_value))
+ return -EFAULT;
+ return 0;
+}
+
+long compat_sys_timer_settime(timer_t timer_id, int flags,
+ struct compat_itimerspec __user *new,
+ struct compat_itimerspec __user *old)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct itimerspec newts, oldts;
+
+ if (!new)
+ return -EINVAL;
+ if (get_compat_itimerspec(&newts, new))
+ return -EFAULT;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_timer_settime(timer_id, flags,
+ (struct itimerspec __user *) &newts,
+ (struct itimerspec __user *) &oldts);
+ set_fs(oldfs);
+ if (!err && old && put_compat_itimerspec(old, &oldts))
+ return -EFAULT;
+ return err;
+}
+
+long compat_sys_timer_gettime(timer_t timer_id,
+ struct compat_itimerspec __user *setting)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct itimerspec ts;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_timer_gettime(timer_id,
+ (struct itimerspec __user *) &ts);
+ set_fs(oldfs);
+ if (!err && put_compat_itimerspec(setting, &ts))
+ return -EFAULT;
+ return err;
+}
+
+long compat_sys_clock_settime(clockid_t which_clock,
+ struct compat_timespec __user *tp)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec ts;
+
+ if (get_compat_timespec(&ts, tp))
+ return -EFAULT;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_clock_settime(which_clock,
+ (struct timespec __user *) &ts);
+ set_fs(oldfs);
+ return err;
+}
+
+long compat_sys_clock_gettime(clockid_t which_clock,
+ struct compat_timespec __user *tp)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec ts;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_clock_gettime(which_clock,
+ (struct timespec __user *) &ts);
+ set_fs(oldfs);
+ if (!err && put_compat_timespec(&ts, tp))
+ return -EFAULT;
+ return err;
+}
+
+long compat_sys_clock_getres(clockid_t which_clock,
+ struct compat_timespec __user *tp)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec ts;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_clock_getres(which_clock,
+ (struct timespec __user *) &ts);
+ set_fs(oldfs);
+ if (!err && tp && put_compat_timespec(&ts, tp))
+ return -EFAULT;
+ return err;
+}
+
+long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
+ struct compat_timespec __user *rqtp,
+ struct compat_timespec __user *rmtp)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec in, out;
+
+ if (get_compat_timespec(&in, rqtp))
+ return -EFAULT;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_clock_nanosleep(which_clock, flags,
+ (struct timespec __user *) &in,
+ (struct timespec __user *) &out);
+ set_fs(oldfs);
+ if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+ put_compat_timespec(&out, rmtp))
+ return -EFAULT;
+ return err;
+}
+
+/*
+ * We currently only need the following fields from the sigevent
+ * structure: sigev_value, sigev_signo, sig_notify and (sometimes
+ * sigev_notify_thread_id). The others are handled in user mode.
+ * We also assume that copying sigev_value.sival_int is sufficient
+ * to keep all the bits of sigev_value.sival_ptr intact.
+ */
+int get_compat_sigevent(struct sigevent *event,
+ const struct compat_sigevent __user *u_event)
+{
+ memset(&event, 0, sizeof(*event));
+ return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) ||
+ __get_user(event->sigev_value.sival_int,
+ &u_event->sigev_value.sival_int) ||
+ __get_user(event->sigev_signo, &u_event->sigev_signo) ||
+ __get_user(event->sigev_notify, &u_event->sigev_notify) ||
+ __get_user(event->sigev_notify_thread_id,
+ &u_event->sigev_notify_thread_id))
+ ? -EFAULT : 0;
+}
+
+/* timer_create is architecture specific because it needs sigevent conversion */
+
+long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
+ unsigned long bitmap_size)
+{
+ int i, j;
+ unsigned long m;
+ compat_ulong_t um;
+ unsigned long nr_compat_longs;
+
+ /* align bitmap up to nearest compat_long_t boundary */
+ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+
+ if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
+ return -EFAULT;
+
+ nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
+
+ for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
+ m = 0;
+
+ for (j = 0; j < sizeof(m)/sizeof(um); j++) {
+ /*
+ * We dont want to read past the end of the userspace
+ * bitmap. We must however ensure the end of the
+ * kernel bitmap is zeroed.
+ */
+ if (nr_compat_longs-- > 0) {
+ if (__get_user(um, umask))
+ return -EFAULT;
+ } else {
+ um = 0;
+ }
+
+ umask++;
+ m |= (long)um << (j * BITS_PER_COMPAT_LONG);
+ }
+ *mask++ = m;
+ }
+
+ return 0;
+}
+
+long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
+ unsigned long bitmap_size)
+{
+ int i, j;
+ unsigned long m;
+ compat_ulong_t um;
+ unsigned long nr_compat_longs;
+
+ /* align bitmap up to nearest compat_long_t boundary */
+ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+
+ if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
+ return -EFAULT;
+
+ nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
+
+ for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
+ m = *mask++;
+
+ for (j = 0; j < sizeof(m)/sizeof(um); j++) {
+ um = m;
+
+ /*
+ * We dont want to write past the end of the userspace
+ * bitmap.
+ */
+ if (nr_compat_longs-- > 0) {
+ if (__put_user(um, umask))
+ return -EFAULT;
+ }
+
+ umask++;
+ m >>= 4*sizeof(um);
+ m >>= 4*sizeof(um);
+ }
+ }
+
+ return 0;
+}
+
+void
+sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
+{
+ switch (_NSIG_WORDS) {
+#if defined (__COMPAT_ENDIAN_SWAP__)
+ case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
+ case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
+ case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
+ case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
+#else
+ case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
+ case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
+ case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
+ case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
+#endif
+ }
+}
+
+asmlinkage long
+compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
+ struct compat_siginfo __user *uinfo,
+ struct compat_timespec __user *uts, compat_size_t sigsetsize)
+{
+ compat_sigset_t s32;
+ sigset_t s;
+ int sig;
+ struct timespec t;
+ siginfo_t info;
+ long ret, timeout = 0;
+
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&s, &s32);
+ sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
+ signotset(&s);
+
+ if (uts) {
+ if (get_compat_timespec (&t, uts))
+ return -EFAULT;
+ if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
+ || t.tv_sec < 0)
+ return -EINVAL;
+ }
+
+ spin_lock_irq(&current->sighand->siglock);
+ sig = dequeue_signal(current, &s, &info);
+ if (!sig) {
+ timeout = MAX_SCHEDULE_TIMEOUT;
+ if (uts)
+ timeout = timespec_to_jiffies(&t)
+ +(t.tv_sec || t.tv_nsec);
+ if (timeout) {
+ current->real_blocked = current->blocked;
+ sigandsets(&current->blocked, &current->blocked, &s);
+
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ current->state = TASK_INTERRUPTIBLE;
+ timeout = schedule_timeout(timeout);
+
+ spin_lock_irq(&current->sighand->siglock);
+ sig = dequeue_signal(current, &s, &info);
+ current->blocked = current->real_blocked;
+ siginitset(&current->real_blocked, 0);
+ recalc_sigpending();
+ }
+ }
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (sig) {
+ ret = sig;
+ if (uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
+ }
+ }else {
+ ret = timeout?-EINTR:-EAGAIN;
+ }
+ return ret;
+
+}
+
+#ifdef __ARCH_WANT_COMPAT_SYS_TIME
+
+/* compat_time_t is a 32 bit "long" and needs to get converted. */
+
+asmlinkage long compat_sys_time(compat_time_t __user * tloc)
+{
+ compat_time_t i;
+ struct timeval tv;
+
+ do_gettimeofday(&tv);
+ i = tv.tv_sec;
+
+ if (tloc) {
+ if (put_user(i,tloc))
+ i = -EFAULT;
+ }
+ return i;
+}
+
+asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
+{
+ struct timespec tv;
+ int err;
+
+ if (get_user(tv.tv_sec, tptr))
+ return -EFAULT;
+
+ tv.tv_nsec = 0;
+
+ err = security_settime(&tv, NULL);
+ if (err)
+ return err;
+
+ do_settimeofday(&tv);
+ return 0;
+}
+
+#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
diff --git a/kernel/configs.c b/kernel/configs.c
new file mode 100644
index 000000000000..986f7af31e0a
--- /dev/null
+++ b/kernel/configs.c
@@ -0,0 +1,118 @@
+/*
+ * kernel/configs.c
+ * Echo the kernel .config file used to build the kernel
+ *
+ * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
+ * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org>
+ * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
+ * Copyright (C) 2002 Hewlett-Packard Company
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+
+/**************************************************/
+/* the actual current config file */
+
+/*
+ * Define kernel_config_data and kernel_config_data_size, which contains the
+ * wrapped and compressed configuration file. The file is first compressed
+ * with gzip and then bounded by two eight byte magic numbers to allow
+ * extraction from a binary kernel image:
+ *
+ * IKCFG_ST
+ * <image>
+ * IKCFG_ED
+ */
+#define MAGIC_START "IKCFG_ST"
+#define MAGIC_END "IKCFG_ED"
+#include "config_data.h"
+
+
+#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
+#define kernel_config_data_size \
+ (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
+
+#ifdef CONFIG_IKCONFIG_PROC
+
+/**************************************************/
+/* globals and useful constants */
+
+static ssize_t
+ikconfig_read_current(struct file *file, char __user *buf,
+ size_t len, loff_t * offset)
+{
+ loff_t pos = *offset;
+ ssize_t count;
+
+ if (pos >= kernel_config_data_size)
+ return 0;
+
+ count = min(len, (size_t)(kernel_config_data_size - pos));
+ if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count))
+ return -EFAULT;
+
+ *offset += count;
+ return count;
+}
+
+static struct file_operations ikconfig_file_ops = {
+ .owner = THIS_MODULE,
+ .read = ikconfig_read_current,
+};
+
+/***************************************************/
+/* ikconfig_init: start up everything we need to */
+
+static int __init ikconfig_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ /* create the current config file */
+ entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO,
+ &proc_root);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->proc_fops = &ikconfig_file_ops;
+ entry->size = kernel_config_data_size;
+
+ return 0;
+}
+
+/***************************************************/
+/* ikconfig_cleanup: clean up our mess */
+
+static void __exit ikconfig_cleanup(void)
+{
+ remove_proc_entry("config.gz", &proc_root);
+}
+
+module_init(ikconfig_init);
+module_exit(ikconfig_cleanup);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Randy Dunlap");
+MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
+
+#endif /* CONFIG_IKCONFIG_PROC */
diff --git a/kernel/cpu.c b/kernel/cpu.c
new file mode 100644
index 000000000000..628f4ccda127
--- /dev/null
+++ b/kernel/cpu.c
@@ -0,0 +1,193 @@
+/* CPU control.
+ * (C) 2001, 2002, 2003, 2004 Rusty Russell
+ *
+ * This code is licenced under the GPL.
+ */
+#include <linux/proc_fs.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/unistd.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/stop_machine.h>
+#include <asm/semaphore.h>
+
+/* This protects CPUs going up and down... */
+DECLARE_MUTEX(cpucontrol);
+
+static struct notifier_block *cpu_chain;
+
+/* Need to know about CPUs going up/down? */
+int register_cpu_notifier(struct notifier_block *nb)
+{
+ int ret;
+
+ if ((ret = down_interruptible(&cpucontrol)) != 0)
+ return ret;
+ ret = notifier_chain_register(&cpu_chain, nb);
+ up(&cpucontrol);
+ return ret;
+}
+EXPORT_SYMBOL(register_cpu_notifier);
+
+void unregister_cpu_notifier(struct notifier_block *nb)
+{
+ down(&cpucontrol);
+ notifier_chain_unregister(&cpu_chain, nb);
+ up(&cpucontrol);
+}
+EXPORT_SYMBOL(unregister_cpu_notifier);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static inline void check_for_tasks(int cpu)
+{
+ struct task_struct *p;
+
+ write_lock_irq(&tasklist_lock);
+ for_each_process(p) {
+ if (task_cpu(p) == cpu &&
+ (!cputime_eq(p->utime, cputime_zero) ||
+ !cputime_eq(p->stime, cputime_zero)))
+ printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
+ (state = %ld, flags = %lx) \n",
+ p->comm, p->pid, cpu, p->state, p->flags);
+ }
+ write_unlock_irq(&tasklist_lock);
+}
+
+/* Take this CPU down. */
+static int take_cpu_down(void *unused)
+{
+ int err;
+
+ /* Take offline: makes arch_cpu_down somewhat easier. */
+ cpu_clear(smp_processor_id(), cpu_online_map);
+
+ /* Ensure this CPU doesn't handle any more interrupts. */
+ err = __cpu_disable();
+ if (err < 0)
+ cpu_set(smp_processor_id(), cpu_online_map);
+ else
+ /* Force idle task to run as soon as we yield: it should
+ immediately notice cpu is offline and die quickly. */
+ sched_idle_next();
+
+ return err;
+}
+
+int cpu_down(unsigned int cpu)
+{
+ int err;
+ struct task_struct *p;
+ cpumask_t old_allowed, tmp;
+
+ if ((err = lock_cpu_hotplug_interruptible()) != 0)
+ return err;
+
+ if (num_online_cpus() == 1) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (!cpu_online(cpu)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
+ (void *)(long)cpu);
+ if (err == NOTIFY_BAD) {
+ printk("%s: attempt to take down CPU %u failed\n",
+ __FUNCTION__, cpu);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Ensure that we are not runnable on dying cpu */
+ old_allowed = current->cpus_allowed;
+ tmp = CPU_MASK_ALL;
+ cpu_clear(cpu, tmp);
+ set_cpus_allowed(current, tmp);
+
+ p = __stop_machine_run(take_cpu_down, NULL, cpu);
+ if (IS_ERR(p)) {
+ /* CPU didn't die: tell everyone. Can't complain. */
+ if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
+ (void *)(long)cpu) == NOTIFY_BAD)
+ BUG();
+
+ err = PTR_ERR(p);
+ goto out_allowed;
+ }
+
+ if (cpu_online(cpu))
+ goto out_thread;
+
+ /* Wait for it to sleep (leaving idle task). */
+ while (!idle_cpu(cpu))
+ yield();
+
+ /* This actually kills the CPU. */
+ __cpu_die(cpu);
+
+ /* Move it here so it can run. */
+ kthread_bind(p, get_cpu());
+ put_cpu();
+
+ /* CPU is completely dead: tell everyone. Too late to complain. */
+ if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu)
+ == NOTIFY_BAD)
+ BUG();
+
+ check_for_tasks(cpu);
+
+out_thread:
+ err = kthread_stop(p);
+out_allowed:
+ set_cpus_allowed(current, old_allowed);
+out:
+ unlock_cpu_hotplug();
+ return err;
+}
+#endif /*CONFIG_HOTPLUG_CPU*/
+
+int __devinit cpu_up(unsigned int cpu)
+{
+ int ret;
+ void *hcpu = (void *)(long)cpu;
+
+ if ((ret = down_interruptible(&cpucontrol)) != 0)
+ return ret;
+
+ if (cpu_online(cpu) || !cpu_present(cpu)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+ if (ret == NOTIFY_BAD) {
+ printk("%s: attempt to bring up CPU %u failed\n",
+ __FUNCTION__, cpu);
+ ret = -EINVAL;
+ goto out_notify;
+ }
+
+ /* Arch-specific enabling code. */
+ ret = __cpu_up(cpu);
+ if (ret != 0)
+ goto out_notify;
+ if (!cpu_online(cpu))
+ BUG();
+
+ /* Now call notifier in preparation. */
+ notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+
+out_notify:
+ if (ret != 0)
+ notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
+out:
+ up(&cpucontrol);
+ return ret;
+}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
new file mode 100644
index 000000000000..69792bbe2281
--- /dev/null
+++ b/kernel/cpuset.c
@@ -0,0 +1,1564 @@
+/*
+ * kernel/cpuset.c
+ *
+ * Processor and Memory placement constraints for sets of tasks.
+ *
+ * Copyright (C) 2003 BULL SA.
+ * Copyright (C) 2004 Silicon Graphics, Inc.
+ *
+ * Portions derived from Patrick Mochel's sysfs code.
+ * sysfs is Copyright (c) 2001-3 Patrick Mochel
+ * Portions Copyright (c) 2004 Silicon Graphics, Inc.
+ *
+ * 2003-10-10 Written by Simon Derr <simon.derr@bull.net>
+ * 2003-10-22 Updates by Stephen Hemminger.
+ * 2004 May-July Rework by Paul Jackson <pj@sgi.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/config.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/backing-dev.h>
+#include <linux/sort.h>
+
+#include <asm/uaccess.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+#define CPUSET_SUPER_MAGIC 0x27e0eb
+
+struct cpuset {
+ unsigned long flags; /* "unsigned long" so bitops work */
+ cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
+ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
+
+ atomic_t count; /* count tasks using this cpuset */
+
+ /*
+ * We link our 'sibling' struct into our parents 'children'.
+ * Our children link their 'sibling' into our 'children'.
+ */
+ struct list_head sibling; /* my parents children */
+ struct list_head children; /* my children */
+
+ struct cpuset *parent; /* my parent */
+ struct dentry *dentry; /* cpuset fs entry */
+
+ /*
+ * Copy of global cpuset_mems_generation as of the most
+ * recent time this cpuset changed its mems_allowed.
+ */
+ int mems_generation;
+};
+
+/* bits in struct cpuset flags field */
+typedef enum {
+ CS_CPU_EXCLUSIVE,
+ CS_MEM_EXCLUSIVE,
+ CS_REMOVED,
+ CS_NOTIFY_ON_RELEASE
+} cpuset_flagbits_t;
+
+/* convenient tests for these bits */
+static inline int is_cpu_exclusive(const struct cpuset *cs)
+{
+ return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_exclusive(const struct cpuset *cs)
+{
+ return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_removed(const struct cpuset *cs)
+{
+ return !!test_bit(CS_REMOVED, &cs->flags);
+}
+
+static inline int notify_on_release(const struct cpuset *cs)
+{
+ return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+}
+
+/*
+ * Increment this atomic integer everytime any cpuset changes its
+ * mems_allowed value. Users of cpusets can track this generation
+ * number, and avoid having to lock and reload mems_allowed unless
+ * the cpuset they're using changes generation.
+ *
+ * A single, global generation is needed because attach_task() could
+ * reattach a task to a different cpuset, which must not have its
+ * generation numbers aliased with those of that tasks previous cpuset.
+ *
+ * Generations are needed for mems_allowed because one task cannot
+ * modify anothers memory placement. So we must enable every task,
+ * on every visit to __alloc_pages(), to efficiently check whether
+ * its current->cpuset->mems_allowed has changed, requiring an update
+ * of its current->mems_allowed.
+ */
+static atomic_t cpuset_mems_generation = ATOMIC_INIT(1);
+
+static struct cpuset top_cpuset = {
+ .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+ .cpus_allowed = CPU_MASK_ALL,
+ .mems_allowed = NODE_MASK_ALL,
+ .count = ATOMIC_INIT(0),
+ .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
+ .children = LIST_HEAD_INIT(top_cpuset.children),
+ .parent = NULL,
+ .dentry = NULL,
+ .mems_generation = 0,
+};
+
+static struct vfsmount *cpuset_mount;
+static struct super_block *cpuset_sb = NULL;
+
+/*
+ * cpuset_sem should be held by anyone who is depending on the children
+ * or sibling lists of any cpuset, or performing non-atomic operations
+ * on the flags or *_allowed values of a cpuset, such as raising the
+ * CS_REMOVED flag bit iff it is not already raised, or reading and
+ * conditionally modifying the *_allowed values. One kernel global
+ * cpuset semaphore should be sufficient - these things don't change
+ * that much.
+ *
+ * The code that modifies cpusets holds cpuset_sem across the entire
+ * operation, from cpuset_common_file_write() down, single threading
+ * all cpuset modifications (except for counter manipulations from
+ * fork and exit) across the system. This presumes that cpuset
+ * modifications are rare - better kept simple and safe, even if slow.
+ *
+ * The code that reads cpusets, such as in cpuset_common_file_read()
+ * and below, only holds cpuset_sem across small pieces of code, such
+ * as when reading out possibly multi-word cpumasks and nodemasks, as
+ * the risks are less, and the desire for performance a little greater.
+ * The proc_cpuset_show() routine needs to hold cpuset_sem to insure
+ * that no cs->dentry is NULL, as it walks up the cpuset tree to root.
+ *
+ * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
+ * (usually) grab cpuset_sem. These are the two most performance
+ * critical pieces of code here. The exception occurs on exit(),
+ * if the last task using a cpuset exits, and the cpuset was marked
+ * notify_on_release. In that case, the cpuset_sem is taken, the
+ * path to the released cpuset calculated, and a usermode call made
+ * to /sbin/cpuset_release_agent with the name of the cpuset (path
+ * relative to the root of cpuset file system) as the argument.
+ *
+ * A cpuset can only be deleted if both its 'count' of using tasks is
+ * zero, and its list of 'children' cpusets is empty. Since all tasks
+ * in the system use _some_ cpuset, and since there is always at least
+ * one task in the system (init, pid == 1), therefore, top_cpuset
+ * always has either children cpusets and/or using tasks. So no need
+ * for any special hack to ensure that top_cpuset cannot be deleted.
+ */
+
+static DECLARE_MUTEX(cpuset_sem);
+
+/*
+ * A couple of forward declarations required, due to cyclic reference loop:
+ * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
+ * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
+ */
+
+static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry);
+
+static struct backing_dev_info cpuset_backing_dev_info = {
+ .ra_pages = 0, /* No readahead */
+ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+static struct inode *cpuset_new_inode(mode_t mode)
+{
+ struct inode *inode = new_inode(cpuset_sb);
+
+ if (inode) {
+ inode->i_mode = mode;
+ inode->i_uid = current->fsuid;
+ inode->i_gid = current->fsgid;
+ inode->i_blksize = PAGE_CACHE_SIZE;
+ inode->i_blocks = 0;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
+ }
+ return inode;
+}
+
+static void cpuset_diput(struct dentry *dentry, struct inode *inode)
+{
+ /* is dentry a directory ? if so, kfree() associated cpuset */
+ if (S_ISDIR(inode->i_mode)) {
+ struct cpuset *cs = dentry->d_fsdata;
+ BUG_ON(!(is_removed(cs)));
+ kfree(cs);
+ }
+ iput(inode);
+}
+
+static struct dentry_operations cpuset_dops = {
+ .d_iput = cpuset_diput,
+};
+
+static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
+{
+ struct qstr qstr;
+ struct dentry *d;
+
+ qstr.name = name;
+ qstr.len = strlen(name);
+ qstr.hash = full_name_hash(name, qstr.len);
+ d = lookup_hash(&qstr, parent);
+ if (!IS_ERR(d))
+ d->d_op = &cpuset_dops;
+ return d;
+}
+
+static void remove_dir(struct dentry *d)
+{
+ struct dentry *parent = dget(d->d_parent);
+
+ d_delete(d);
+ simple_rmdir(parent->d_inode, d);
+ dput(parent);
+}
+
+/*
+ * NOTE : the dentry must have been dget()'ed
+ */
+static void cpuset_d_remove_dir(struct dentry *dentry)
+{
+ struct list_head *node;
+
+ spin_lock(&dcache_lock);
+ node = dentry->d_subdirs.next;
+ while (node != &dentry->d_subdirs) {
+ struct dentry *d = list_entry(node, struct dentry, d_child);
+ list_del_init(node);
+ if (d->d_inode) {
+ d = dget_locked(d);
+ spin_unlock(&dcache_lock);
+ d_delete(d);
+ simple_unlink(dentry->d_inode, d);
+ dput(d);
+ spin_lock(&dcache_lock);
+ }
+ node = dentry->d_subdirs.next;
+ }
+ list_del_init(&dentry->d_child);
+ spin_unlock(&dcache_lock);
+ remove_dir(dentry);
+}
+
+static struct super_operations cpuset_ops = {
+ .statfs = simple_statfs,
+ .drop_inode = generic_delete_inode,
+};
+
+static int cpuset_fill_super(struct super_block *sb, void *unused_data,
+ int unused_silent)
+{
+ struct inode *inode;
+ struct dentry *root;
+
+ sb->s_blocksize = PAGE_CACHE_SIZE;
+ sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_magic = CPUSET_SUPER_MAGIC;
+ sb->s_op = &cpuset_ops;
+ cpuset_sb = sb;
+
+ inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR);
+ if (inode) {
+ inode->i_op = &simple_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ /* directories start off with i_nlink == 2 (for "." entry) */
+ inode->i_nlink++;
+ } else {
+ return -ENOMEM;
+ }
+
+ root = d_alloc_root(inode);
+ if (!root) {
+ iput(inode);
+ return -ENOMEM;
+ }
+ sb->s_root = root;
+ return 0;
+}
+
+static struct super_block *cpuset_get_sb(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name,
+ void *data)
+{
+ return get_sb_single(fs_type, flags, data, cpuset_fill_super);
+}
+
+static struct file_system_type cpuset_fs_type = {
+ .name = "cpuset",
+ .get_sb = cpuset_get_sb,
+ .kill_sb = kill_litter_super,
+};
+
+/* struct cftype:
+ *
+ * The files in the cpuset filesystem mostly have a very simple read/write
+ * handling, some common function will take care of it. Nevertheless some cases
+ * (read tasks) are special and therefore I define this structure for every
+ * kind of file.
+ *
+ *
+ * When reading/writing to a file:
+ * - the cpuset to use in file->f_dentry->d_parent->d_fsdata
+ * - the 'cftype' of the file is file->f_dentry->d_fsdata
+ */
+
+struct cftype {
+ char *name;
+ int private;
+ int (*open) (struct inode *inode, struct file *file);
+ ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes,
+ loff_t *ppos);
+ int (*write) (struct file *file, const char __user *buf, size_t nbytes,
+ loff_t *ppos);
+ int (*release) (struct inode *inode, struct file *file);
+};
+
+static inline struct cpuset *__d_cs(struct dentry *dentry)
+{
+ return dentry->d_fsdata;
+}
+
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+ return dentry->d_fsdata;
+}
+
+/*
+ * Call with cpuset_sem held. Writes path of cpuset into buf.
+ * Returns 0 on success, -errno on error.
+ */
+
+static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
+{
+ char *start;
+
+ start = buf + buflen;
+
+ *--start = '\0';
+ for (;;) {
+ int len = cs->dentry->d_name.len;
+ if ((start -= len) < buf)
+ return -ENAMETOOLONG;
+ memcpy(start, cs->dentry->d_name.name, len);
+ cs = cs->parent;
+ if (!cs)
+ break;
+ if (!cs->parent)
+ continue;
+ if (--start < buf)
+ return -ENAMETOOLONG;
+ *start = '/';
+ }
+ memmove(buf, start, buf + buflen - start);
+ return 0;
+}
+
+/*
+ * Notify userspace when a cpuset is released, by running
+ * /sbin/cpuset_release_agent with the name of the cpuset (path
+ * relative to the root of cpuset file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cpuset.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cpuset before it is removed, or that some other
+ * user task will 'mkdir' a child cpuset of this cpuset. That's ok.
+ * The presumed 'rmdir' will fail quietly if this cpuset is no longer
+ * unused, and this cpuset will be reprieved from its death sentence,
+ * to continue to serve a useful existence. Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * Note final arg to call_usermodehelper() is 0 - that means
+ * don't wait. Since we are holding the global cpuset_sem here,
+ * and we are asking another thread (started from keventd) to rmdir a
+ * cpuset, we can't wait - or we'd deadlock with the removing thread
+ * on cpuset_sem.
+ */
+
+static int cpuset_release_agent(char *cpuset_str)
+{
+ char *argv[3], *envp[3];
+ int i;
+
+ i = 0;
+ argv[i++] = "/sbin/cpuset_release_agent";
+ argv[i++] = cpuset_str;
+ argv[i] = NULL;
+
+ i = 0;
+ /* minimal command environment */
+ envp[i++] = "HOME=/";
+ envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[i] = NULL;
+
+ return call_usermodehelper(argv[0], argv, envp, 0);
+}
+
+/*
+ * Either cs->count of using tasks transitioned to zero, or the
+ * cs->children list of child cpusets just became empty. If this
+ * cs is notify_on_release() and now both the user count is zero and
+ * the list of children is empty, send notice to user land.
+ */
+
+static void check_for_release(struct cpuset *cs)
+{
+ if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
+ list_empty(&cs->children)) {
+ char *buf;
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return;
+ if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
+ goto out;
+ cpuset_release_agent(buf);
+out:
+ kfree(buf);
+ }
+}
+
+/*
+ * Return in *pmask the portion of a cpusets's cpus_allowed that
+ * are online. If none are online, walk up the cpuset hierarchy
+ * until we find one that does have some online cpus. If we get
+ * all the way to the top and still haven't found any online cpus,
+ * return cpu_online_map. Or if passed a NULL cs from an exit'ing
+ * task, return cpu_online_map.
+ *
+ * One way or another, we guarantee to return some non-empty subset
+ * of cpu_online_map.
+ *
+ * Call with cpuset_sem held.
+ */
+
+static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
+{
+ while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
+ cs = cs->parent;
+ if (cs)
+ cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
+ else
+ *pmask = cpu_online_map;
+ BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
+}
+
+/*
+ * Return in *pmask the portion of a cpusets's mems_allowed that
+ * are online. If none are online, walk up the cpuset hierarchy
+ * until we find one that does have some online mems. If we get
+ * all the way to the top and still haven't found any online mems,
+ * return node_online_map.
+ *
+ * One way or another, we guarantee to return some non-empty subset
+ * of node_online_map.
+ *
+ * Call with cpuset_sem held.
+ */
+
+static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
+{
+ while (cs && !nodes_intersects(cs->mems_allowed, node_online_map))
+ cs = cs->parent;
+ if (cs)
+ nodes_and(*pmask, cs->mems_allowed, node_online_map);
+ else
+ *pmask = node_online_map;
+ BUG_ON(!nodes_intersects(*pmask, node_online_map));
+}
+
+/*
+ * Refresh current tasks mems_allowed and mems_generation from
+ * current tasks cpuset. Call with cpuset_sem held.
+ *
+ * Be sure to call refresh_mems() on any cpuset operation which
+ * (1) holds cpuset_sem, and (2) might possibly alloc memory.
+ * Call after obtaining cpuset_sem lock, before any possible
+ * allocation. Otherwise one risks trying to allocate memory
+ * while the task cpuset_mems_generation is not the same as
+ * the mems_generation in its cpuset, which would deadlock on
+ * cpuset_sem in cpuset_update_current_mems_allowed().
+ *
+ * Since we hold cpuset_sem, once refresh_mems() is called, the
+ * test (current->cpuset_mems_generation != cs->mems_generation)
+ * in cpuset_update_current_mems_allowed() will remain false,
+ * until we drop cpuset_sem. Anyone else who would change our
+ * cpusets mems_generation needs to lock cpuset_sem first.
+ */
+
+static void refresh_mems(void)
+{
+ struct cpuset *cs = current->cpuset;
+
+ if (current->cpuset_mems_generation != cs->mems_generation) {
+ guarantee_online_mems(cs, &current->mems_allowed);
+ current->cpuset_mems_generation = cs->mems_generation;
+ }
+}
+
+/*
+ * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
+ *
+ * One cpuset is a subset of another if all its allowed CPUs and
+ * Memory Nodes are a subset of the other, and its exclusive flags
+ * are only set if the other's are set.
+ */
+
+static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
+{
+ return cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
+ nodes_subset(p->mems_allowed, q->mems_allowed) &&
+ is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
+ is_mem_exclusive(p) <= is_mem_exclusive(q);
+}
+
+/*
+ * validate_change() - Used to validate that any proposed cpuset change
+ * follows the structural rules for cpusets.
+ *
+ * If we replaced the flag and mask values of the current cpuset
+ * (cur) with those values in the trial cpuset (trial), would
+ * our various subset and exclusive rules still be valid? Presumes
+ * cpuset_sem held.
+ *
+ * 'cur' is the address of an actual, in-use cpuset. Operations
+ * such as list traversal that depend on the actual address of the
+ * cpuset in the list must use cur below, not trial.
+ *
+ * 'trial' is the address of bulk structure copy of cur, with
+ * perhaps one or more of the fields cpus_allowed, mems_allowed,
+ * or flags changed to new, trial values.
+ *
+ * Return 0 if valid, -errno if not.
+ */
+
+static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
+{
+ struct cpuset *c, *par;
+
+ /* Each of our child cpusets must be a subset of us */
+ list_for_each_entry(c, &cur->children, sibling) {
+ if (!is_cpuset_subset(c, trial))
+ return -EBUSY;
+ }
+
+ /* Remaining checks don't apply to root cpuset */
+ if ((par = cur->parent) == NULL)
+ return 0;
+
+ /* We must be a subset of our parent cpuset */
+ if (!is_cpuset_subset(trial, par))
+ return -EACCES;
+
+ /* If either I or some sibling (!= me) is exclusive, we can't overlap */
+ list_for_each_entry(c, &par->children, sibling) {
+ if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
+ c != cur &&
+ cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
+ return -EINVAL;
+ if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
+ c != cur &&
+ nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int update_cpumask(struct cpuset *cs, char *buf)
+{
+ struct cpuset trialcs;
+ int retval;
+
+ trialcs = *cs;
+ retval = cpulist_parse(buf, trialcs.cpus_allowed);
+ if (retval < 0)
+ return retval;
+ cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
+ if (cpus_empty(trialcs.cpus_allowed))
+ return -ENOSPC;
+ retval = validate_change(cs, &trialcs);
+ if (retval == 0)
+ cs->cpus_allowed = trialcs.cpus_allowed;
+ return retval;
+}
+
+static int update_nodemask(struct cpuset *cs, char *buf)
+{
+ struct cpuset trialcs;
+ int retval;
+
+ trialcs = *cs;
+ retval = nodelist_parse(buf, trialcs.mems_allowed);
+ if (retval < 0)
+ return retval;
+ nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
+ if (nodes_empty(trialcs.mems_allowed))
+ return -ENOSPC;
+ retval = validate_change(cs, &trialcs);
+ if (retval == 0) {
+ cs->mems_allowed = trialcs.mems_allowed;
+ atomic_inc(&cpuset_mems_generation);
+ cs->mems_generation = atomic_read(&cpuset_mems_generation);
+ }
+ return retval;
+}
+
+/*
+ * update_flag - read a 0 or a 1 in a file and update associated flag
+ * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
+ * CS_NOTIFY_ON_RELEASE)
+ * cs: the cpuset to update
+ * buf: the buffer where we read the 0 or 1
+ */
+
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
+{
+ int turning_on;
+ struct cpuset trialcs;
+ int err;
+
+ turning_on = (simple_strtoul(buf, NULL, 10) != 0);
+
+ trialcs = *cs;
+ if (turning_on)
+ set_bit(bit, &trialcs.flags);
+ else
+ clear_bit(bit, &trialcs.flags);
+
+ err = validate_change(cs, &trialcs);
+ if (err == 0) {
+ if (turning_on)
+ set_bit(bit, &cs->flags);
+ else
+ clear_bit(bit, &cs->flags);
+ }
+ return err;
+}
+
+static int attach_task(struct cpuset *cs, char *buf)
+{
+ pid_t pid;
+ struct task_struct *tsk;
+ struct cpuset *oldcs;
+ cpumask_t cpus;
+
+ if (sscanf(buf, "%d", &pid) != 1)
+ return -EIO;
+ if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ return -ENOSPC;
+
+ if (pid) {
+ read_lock(&tasklist_lock);
+
+ tsk = find_task_by_pid(pid);
+ if (!tsk) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+ }
+
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+
+ if ((current->euid) && (current->euid != tsk->uid)
+ && (current->euid != tsk->suid)) {
+ put_task_struct(tsk);
+ return -EACCES;
+ }
+ } else {
+ tsk = current;
+ get_task_struct(tsk);
+ }
+
+ task_lock(tsk);
+ oldcs = tsk->cpuset;
+ if (!oldcs) {
+ task_unlock(tsk);
+ put_task_struct(tsk);
+ return -ESRCH;
+ }
+ atomic_inc(&cs->count);
+ tsk->cpuset = cs;
+ task_unlock(tsk);
+
+ guarantee_online_cpus(cs, &cpus);
+ set_cpus_allowed(tsk, cpus);
+
+ put_task_struct(tsk);
+ if (atomic_dec_and_test(&oldcs->count))
+ check_for_release(oldcs);
+ return 0;
+}
+
+/* The various types of files and directories in a cpuset file system */
+
+typedef enum {
+ FILE_ROOT,
+ FILE_DIR,
+ FILE_CPULIST,
+ FILE_MEMLIST,
+ FILE_CPU_EXCLUSIVE,
+ FILE_MEM_EXCLUSIVE,
+ FILE_NOTIFY_ON_RELEASE,
+ FILE_TASKLIST,
+} cpuset_filetype_t;
+
+static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf,
+ size_t nbytes, loff_t *unused_ppos)
+{
+ struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+ struct cftype *cft = __d_cft(file->f_dentry);
+ cpuset_filetype_t type = cft->private;
+ char *buffer;
+ int retval = 0;
+
+ /* Crude upper limit on largest legitimate cpulist user might write. */
+ if (nbytes > 100 + 6 * NR_CPUS)
+ return -E2BIG;
+
+ /* +1 for nul-terminator */
+ if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
+ return -ENOMEM;
+
+ if (copy_from_user(buffer, userbuf, nbytes)) {
+ retval = -EFAULT;
+ goto out1;
+ }
+ buffer[nbytes] = 0; /* nul-terminate */
+
+ down(&cpuset_sem);
+
+ if (is_removed(cs)) {
+ retval = -ENODEV;
+ goto out2;
+ }
+
+ switch (type) {
+ case FILE_CPULIST:
+ retval = update_cpumask(cs, buffer);
+ break;
+ case FILE_MEMLIST:
+ retval = update_nodemask(cs, buffer);
+ break;
+ case FILE_CPU_EXCLUSIVE:
+ retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
+ break;
+ case FILE_MEM_EXCLUSIVE:
+ retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
+ break;
+ case FILE_NOTIFY_ON_RELEASE:
+ retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
+ break;
+ case FILE_TASKLIST:
+ retval = attach_task(cs, buffer);
+ break;
+ default:
+ retval = -EINVAL;
+ goto out2;
+ }
+
+ if (retval == 0)
+ retval = nbytes;
+out2:
+ up(&cpuset_sem);
+out1:
+ kfree(buffer);
+ return retval;
+}
+
+static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ ssize_t retval = 0;
+ struct cftype *cft = __d_cft(file->f_dentry);
+ if (!cft)
+ return -ENODEV;
+
+ /* special function ? */
+ if (cft->write)
+ retval = cft->write(file, buf, nbytes, ppos);
+ else
+ retval = cpuset_common_file_write(file, buf, nbytes, ppos);
+
+ return retval;
+}
+
+/*
+ * These ascii lists should be read in a single call, by using a user
+ * buffer large enough to hold the entire map. If read in smaller
+ * chunks, there is no guarantee of atomicity. Since the display format
+ * used, list of ranges of sequential numbers, is variable length,
+ * and since these maps can change value dynamically, one could read
+ * gibberish by doing partial reads while a list was changing.
+ * A single large read to a buffer that crosses a page boundary is
+ * ok, because the result being copied to user land is not recomputed
+ * across a page fault.
+ */
+
+static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+{
+ cpumask_t mask;
+
+ down(&cpuset_sem);
+ mask = cs->cpus_allowed;
+ up(&cpuset_sem);
+
+ return cpulist_scnprintf(page, PAGE_SIZE, mask);
+}
+
+static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+{
+ nodemask_t mask;
+
+ down(&cpuset_sem);
+ mask = cs->mems_allowed;
+ up(&cpuset_sem);
+
+ return nodelist_scnprintf(page, PAGE_SIZE, mask);
+}
+
+static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ struct cftype *cft = __d_cft(file->f_dentry);
+ struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+ cpuset_filetype_t type = cft->private;
+ char *page;
+ ssize_t retval = 0;
+ char *s;
+ char *start;
+ size_t n;
+
+ if (!(page = (char *)__get_free_page(GFP_KERNEL)))
+ return -ENOMEM;
+
+ s = page;
+
+ switch (type) {
+ case FILE_CPULIST:
+ s += cpuset_sprintf_cpulist(s, cs);
+ break;
+ case FILE_MEMLIST:
+ s += cpuset_sprintf_memlist(s, cs);
+ break;
+ case FILE_CPU_EXCLUSIVE:
+ *s++ = is_cpu_exclusive(cs) ? '1' : '0';
+ break;
+ case FILE_MEM_EXCLUSIVE:
+ *s++ = is_mem_exclusive(cs) ? '1' : '0';
+ break;
+ case FILE_NOTIFY_ON_RELEASE:
+ *s++ = notify_on_release(cs) ? '1' : '0';
+ break;
+ default:
+ retval = -EINVAL;
+ goto out;
+ }
+ *s++ = '\n';
+ *s = '\0';
+
+ start = page + *ppos;
+ n = s - start;
+ retval = n - copy_to_user(buf, start, min(n, nbytes));
+ *ppos += retval;
+out:
+ free_page((unsigned long)page);
+ return retval;
+}
+
+static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbytes,
+ loff_t *ppos)
+{
+ ssize_t retval = 0;
+ struct cftype *cft = __d_cft(file->f_dentry);
+ if (!cft)
+ return -ENODEV;
+
+ /* special function ? */
+ if (cft->read)
+ retval = cft->read(file, buf, nbytes, ppos);
+ else
+ retval = cpuset_common_file_read(file, buf, nbytes, ppos);
+
+ return retval;
+}
+
+static int cpuset_file_open(struct inode *inode, struct file *file)
+{
+ int err;
+ struct cftype *cft;
+
+ err = generic_file_open(inode, file);
+ if (err)
+ return err;
+
+ cft = __d_cft(file->f_dentry);
+ if (!cft)
+ return -ENODEV;
+ if (cft->open)
+ err = cft->open(inode, file);
+ else
+ err = 0;
+
+ return err;
+}
+
+static int cpuset_file_release(struct inode *inode, struct file *file)
+{
+ struct cftype *cft = __d_cft(file->f_dentry);
+ if (cft->release)
+ return cft->release(inode, file);
+ return 0;
+}
+
+static struct file_operations cpuset_file_operations = {
+ .read = cpuset_file_read,
+ .write = cpuset_file_write,
+ .llseek = generic_file_llseek,
+ .open = cpuset_file_open,
+ .release = cpuset_file_release,
+};
+
+static struct inode_operations cpuset_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .mkdir = cpuset_mkdir,
+ .rmdir = cpuset_rmdir,
+};
+
+static int cpuset_create_file(struct dentry *dentry, int mode)
+{
+ struct inode *inode;
+
+ if (!dentry)
+ return -ENOENT;
+ if (dentry->d_inode)
+ return -EEXIST;
+
+ inode = cpuset_new_inode(mode);
+ if (!inode)
+ return -ENOMEM;
+
+ if (S_ISDIR(mode)) {
+ inode->i_op = &cpuset_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+
+ /* start off with i_nlink == 2 (for "." entry) */
+ inode->i_nlink++;
+ } else if (S_ISREG(mode)) {
+ inode->i_size = 0;
+ inode->i_fop = &cpuset_file_operations;
+ }
+
+ d_instantiate(dentry, inode);
+ dget(dentry); /* Extra count - pin the dentry in core */
+ return 0;
+}
+
+/*
+ * cpuset_create_dir - create a directory for an object.
+ * cs: the cpuset we create the directory for.
+ * It must have a valid ->parent field
+ * And we are going to fill its ->dentry field.
+ * name: The name to give to the cpuset directory. Will be copied.
+ * mode: mode to set on new directory.
+ */
+
+static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
+{
+ struct dentry *dentry = NULL;
+ struct dentry *parent;
+ int error = 0;
+
+ parent = cs->parent->dentry;
+ dentry = cpuset_get_dentry(parent, name);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ error = cpuset_create_file(dentry, S_IFDIR | mode);
+ if (!error) {
+ dentry->d_fsdata = cs;
+ parent->d_inode->i_nlink++;
+ cs->dentry = dentry;
+ }
+ dput(dentry);
+
+ return error;
+}
+
+static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
+{
+ struct dentry *dentry;
+ int error;
+
+ down(&dir->d_inode->i_sem);
+ dentry = cpuset_get_dentry(dir, cft->name);
+ if (!IS_ERR(dentry)) {
+ error = cpuset_create_file(dentry, 0644 | S_IFREG);
+ if (!error)
+ dentry->d_fsdata = (void *)cft;
+ dput(dentry);
+ } else
+ error = PTR_ERR(dentry);
+ up(&dir->d_inode->i_sem);
+ return error;
+}
+
+/*
+ * Stuff for reading the 'tasks' file.
+ *
+ * Reading this file can return large amounts of data if a cpuset has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ * Upon tasks file open(), a struct ctr_struct is allocated, that
+ * will have a pointer to an array (also allocated here). The struct
+ * ctr_struct * is stored in file->private_data. Its resources will
+ * be freed by release() when the file is closed. The array is used
+ * to sprintf the PIDs and then used by read().
+ */
+
+/* cpusets_tasks_read array */
+
+struct ctr_struct {
+ char *buf;
+ int bufsz;
+};
+
+/*
+ * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
+ * Return actual number of pids loaded.
+ */
+static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
+{
+ int n = 0;
+ struct task_struct *g, *p;
+
+ read_lock(&tasklist_lock);
+
+ do_each_thread(g, p) {
+ if (p->cpuset == cs) {
+ pidarray[n++] = p->pid;
+ if (unlikely(n == npids))
+ goto array_full;
+ }
+ } while_each_thread(g, p);
+
+array_full:
+ read_unlock(&tasklist_lock);
+ return n;
+}
+
+static int cmppid(const void *a, const void *b)
+{
+ return *(pid_t *)a - *(pid_t *)b;
+}
+
+/*
+ * Convert array 'a' of 'npids' pid_t's to a string of newline separated
+ * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
+ * count 'cnt' of how many chars would be written if buf were large enough.
+ */
+static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
+{
+ int cnt = 0;
+ int i;
+
+ for (i = 0; i < npids; i++)
+ cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
+ return cnt;
+}
+
+static int cpuset_tasks_open(struct inode *unused, struct file *file)
+{
+ struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+ struct ctr_struct *ctr;
+ pid_t *pidarray;
+ int npids;
+ char c;
+
+ if (!(file->f_mode & FMODE_READ))
+ return 0;
+
+ ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
+ if (!ctr)
+ goto err0;
+
+ /*
+ * If cpuset gets more users after we read count, we won't have
+ * enough space - tough. This race is indistinguishable to the
+ * caller from the case that the additional cpuset users didn't
+ * show up until sometime later on.
+ */
+ npids = atomic_read(&cs->count);
+ pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+ if (!pidarray)
+ goto err1;
+
+ npids = pid_array_load(pidarray, npids, cs);
+ sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
+
+ /* Call pid_array_to_buf() twice, first just to get bufsz */
+ ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
+ ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
+ if (!ctr->buf)
+ goto err2;
+ ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
+
+ kfree(pidarray);
+ file->private_data = ctr;
+ return 0;
+
+err2:
+ kfree(pidarray);
+err1:
+ kfree(ctr);
+err0:
+ return -ENOMEM;
+}
+
+static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ struct ctr_struct *ctr = file->private_data;
+
+ if (*ppos + nbytes > ctr->bufsz)
+ nbytes = ctr->bufsz - *ppos;
+ if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
+ return -EFAULT;
+ *ppos += nbytes;
+ return nbytes;
+}
+
+static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
+{
+ struct ctr_struct *ctr;
+
+ if (file->f_mode & FMODE_READ) {
+ ctr = file->private_data;
+ kfree(ctr->buf);
+ kfree(ctr);
+ }
+ return 0;
+}
+
+/*
+ * for the common functions, 'private' gives the type of file
+ */
+
+static struct cftype cft_tasks = {
+ .name = "tasks",
+ .open = cpuset_tasks_open,
+ .read = cpuset_tasks_read,
+ .release = cpuset_tasks_release,
+ .private = FILE_TASKLIST,
+};
+
+static struct cftype cft_cpus = {
+ .name = "cpus",
+ .private = FILE_CPULIST,
+};
+
+static struct cftype cft_mems = {
+ .name = "mems",
+ .private = FILE_MEMLIST,
+};
+
+static struct cftype cft_cpu_exclusive = {
+ .name = "cpu_exclusive",
+ .private = FILE_CPU_EXCLUSIVE,
+};
+
+static struct cftype cft_mem_exclusive = {
+ .name = "mem_exclusive",
+ .private = FILE_MEM_EXCLUSIVE,
+};
+
+static struct cftype cft_notify_on_release = {
+ .name = "notify_on_release",
+ .private = FILE_NOTIFY_ON_RELEASE,
+};
+
+static int cpuset_populate_dir(struct dentry *cs_dentry)
+{
+ int err;
+
+ if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0)
+ return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0)
+ return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
+ return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
+ return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
+ return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
+ return err;
+ return 0;
+}
+
+/*
+ * cpuset_create - create a cpuset
+ * parent: cpuset that will be parent of the new cpuset.
+ * name: name of the new cpuset. Will be strcpy'ed.
+ * mode: mode to set on new inode
+ *
+ * Must be called with the semaphore on the parent inode held
+ */
+
+static long cpuset_create(struct cpuset *parent, const char *name, int mode)
+{
+ struct cpuset *cs;
+ int err;
+
+ cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
+ return -ENOMEM;
+
+ down(&cpuset_sem);
+ refresh_mems();
+ cs->flags = 0;
+ if (notify_on_release(parent))
+ set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+ cs->cpus_allowed = CPU_MASK_NONE;
+ cs->mems_allowed = NODE_MASK_NONE;
+ atomic_set(&cs->count, 0);
+ INIT_LIST_HEAD(&cs->sibling);
+ INIT_LIST_HEAD(&cs->children);
+ atomic_inc(&cpuset_mems_generation);
+ cs->mems_generation = atomic_read(&cpuset_mems_generation);
+
+ cs->parent = parent;
+
+ list_add(&cs->sibling, &cs->parent->children);
+
+ err = cpuset_create_dir(cs, name, mode);
+ if (err < 0)
+ goto err;
+
+ /*
+ * Release cpuset_sem before cpuset_populate_dir() because it
+ * will down() this new directory's i_sem and if we race with
+ * another mkdir, we might deadlock.
+ */
+ up(&cpuset_sem);
+
+ err = cpuset_populate_dir(cs->dentry);
+ /* If err < 0, we have a half-filled directory - oh well ;) */
+ return 0;
+err:
+ list_del(&cs->sibling);
+ up(&cpuset_sem);
+ kfree(cs);
+ return err;
+}
+
+static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ struct cpuset *c_parent = dentry->d_parent->d_fsdata;
+
+ /* the vfs holds inode->i_sem already */
+ return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
+}
+
+static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
+{
+ struct cpuset *cs = dentry->d_fsdata;
+ struct dentry *d;
+ struct cpuset *parent;
+
+ /* the vfs holds both inode->i_sem already */
+
+ down(&cpuset_sem);
+ refresh_mems();
+ if (atomic_read(&cs->count) > 0) {
+ up(&cpuset_sem);
+ return -EBUSY;
+ }
+ if (!list_empty(&cs->children)) {
+ up(&cpuset_sem);
+ return -EBUSY;
+ }
+ spin_lock(&cs->dentry->d_lock);
+ parent = cs->parent;
+ set_bit(CS_REMOVED, &cs->flags);
+ list_del(&cs->sibling); /* delete my sibling from parent->children */
+ if (list_empty(&parent->children))
+ check_for_release(parent);
+ d = dget(cs->dentry);
+ cs->dentry = NULL;
+ spin_unlock(&d->d_lock);
+ cpuset_d_remove_dir(d);
+ dput(d);
+ up(&cpuset_sem);
+ return 0;
+}
+
+/**
+ * cpuset_init - initialize cpusets at system boot
+ *
+ * Description: Initialize top_cpuset and the cpuset internal file system,
+ **/
+
+int __init cpuset_init(void)
+{
+ struct dentry *root;
+ int err;
+
+ top_cpuset.cpus_allowed = CPU_MASK_ALL;
+ top_cpuset.mems_allowed = NODE_MASK_ALL;
+
+ atomic_inc(&cpuset_mems_generation);
+ top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
+
+ init_task.cpuset = &top_cpuset;
+
+ err = register_filesystem(&cpuset_fs_type);
+ if (err < 0)
+ goto out;
+ cpuset_mount = kern_mount(&cpuset_fs_type);
+ if (IS_ERR(cpuset_mount)) {
+ printk(KERN_ERR "cpuset: could not mount!\n");
+ err = PTR_ERR(cpuset_mount);
+ cpuset_mount = NULL;
+ goto out;
+ }
+ root = cpuset_mount->mnt_sb->s_root;
+ root->d_fsdata = &top_cpuset;
+ root->d_inode->i_nlink++;
+ top_cpuset.dentry = root;
+ root->d_inode->i_op = &cpuset_dir_inode_operations;
+ err = cpuset_populate_dir(root);
+out:
+ return err;
+}
+
+/**
+ * cpuset_init_smp - initialize cpus_allowed
+ *
+ * Description: Finish top cpuset after cpu, node maps are initialized
+ **/
+
+void __init cpuset_init_smp(void)
+{
+ top_cpuset.cpus_allowed = cpu_online_map;
+ top_cpuset.mems_allowed = node_online_map;
+}
+
+/**
+ * cpuset_fork - attach newly forked task to its parents cpuset.
+ * @p: pointer to task_struct of forking parent process.
+ *
+ * Description: By default, on fork, a task inherits its
+ * parents cpuset. The pointer to the shared cpuset is
+ * automatically copied in fork.c by dup_task_struct().
+ * This cpuset_fork() routine need only increment the usage
+ * counter in that cpuset.
+ **/
+
+void cpuset_fork(struct task_struct *tsk)
+{
+ atomic_inc(&tsk->cpuset->count);
+}
+
+/**
+ * cpuset_exit - detach cpuset from exiting task
+ * @tsk: pointer to task_struct of exiting process
+ *
+ * Description: Detach cpuset from @tsk and release it.
+ *
+ **/
+
+void cpuset_exit(struct task_struct *tsk)
+{
+ struct cpuset *cs;
+
+ task_lock(tsk);
+ cs = tsk->cpuset;
+ tsk->cpuset = NULL;
+ task_unlock(tsk);
+
+ if (atomic_dec_and_test(&cs->count)) {
+ down(&cpuset_sem);
+ check_for_release(cs);
+ up(&cpuset_sem);
+ }
+}
+
+/**
+ * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ *
+ * Description: Returns the cpumask_t cpus_allowed of the cpuset
+ * attached to the specified @tsk. Guaranteed to return some non-empty
+ * subset of cpu_online_map, even if this means going outside the
+ * tasks cpuset.
+ **/
+
+const cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
+{
+ cpumask_t mask;
+
+ down(&cpuset_sem);
+ task_lock((struct task_struct *)tsk);
+ guarantee_online_cpus(tsk->cpuset, &mask);
+ task_unlock((struct task_struct *)tsk);
+ up(&cpuset_sem);
+
+ return mask;
+}
+
+void cpuset_init_current_mems_allowed(void)
+{
+ current->mems_allowed = NODE_MASK_ALL;
+}
+
+/*
+ * If the current tasks cpusets mems_allowed changed behind our backs,
+ * update current->mems_allowed and mems_generation to the new value.
+ * Do not call this routine if in_interrupt().
+ */
+
+void cpuset_update_current_mems_allowed(void)
+{
+ struct cpuset *cs = current->cpuset;
+
+ if (!cs)
+ return; /* task is exiting */
+ if (current->cpuset_mems_generation != cs->mems_generation) {
+ down(&cpuset_sem);
+ refresh_mems();
+ up(&cpuset_sem);
+ }
+}
+
+void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
+{
+ bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
+ MAX_NUMNODES);
+}
+
+/*
+ * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
+ */
+int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
+{
+ int i;
+
+ for (i = 0; zl->zones[i]; i++) {
+ int nid = zl->zones[i]->zone_pgdat->node_id;
+
+ if (node_isset(nid, current->mems_allowed))
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Is 'current' valid, and is zone z allowed in current->mems_allowed?
+ */
+int cpuset_zone_allowed(struct zone *z)
+{
+ return in_interrupt() ||
+ node_isset(z->zone_pgdat->node_id, current->mems_allowed);
+}
+
+/*
+ * proc_cpuset_show()
+ * - Print tasks cpuset path into seq_file.
+ * - Used for /proc/<pid>/cpuset.
+ */
+
+static int proc_cpuset_show(struct seq_file *m, void *v)
+{
+ struct cpuset *cs;
+ struct task_struct *tsk;
+ char *buf;
+ int retval = 0;
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ tsk = m->private;
+ down(&cpuset_sem);
+ task_lock(tsk);
+ cs = tsk->cpuset;
+ task_unlock(tsk);
+ if (!cs) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ retval = cpuset_path(cs, buf, PAGE_SIZE);
+ if (retval < 0)
+ goto out;
+ seq_puts(m, buf);
+ seq_putc(m, '\n');
+out:
+ up(&cpuset_sem);
+ kfree(buf);
+ return retval;
+}
+
+static int cpuset_open(struct inode *inode, struct file *file)
+{
+ struct task_struct *tsk = PROC_I(inode)->task;
+ return single_open(file, proc_cpuset_show, tsk);
+}
+
+struct file_operations proc_cpuset_operations = {
+ .open = cpuset_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
+char *cpuset_task_status_allowed(struct task_struct *task, char *buffer)
+{
+ buffer += sprintf(buffer, "Cpus_allowed:\t");
+ buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed);
+ buffer += sprintf(buffer, "\n");
+ buffer += sprintf(buffer, "Mems_allowed:\t");
+ buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed);
+ buffer += sprintf(buffer, "\n");
+ return buffer;
+}
diff --git a/kernel/dma.c b/kernel/dma.c
new file mode 100644
index 000000000000..aef0a45b7893
--- /dev/null
+++ b/kernel/dma.c
@@ -0,0 +1,158 @@
+/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $
+ * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
+ *
+ * Written by Hennus Bergman, 1992.
+ *
+ * 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma.
+ * In the previous version the reported device could end up being wrong,
+ * if a device requested a DMA channel that was already in use.
+ * [It also happened to remove the sizeof(char *) == sizeof(int)
+ * assumption introduced because of those /proc/dma patches. -- Hennus]
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <asm/dma.h>
+#include <asm/system.h>
+
+
+
+/* A note on resource allocation:
+ *
+ * All drivers needing DMA channels, should allocate and release them
+ * through the public routines `request_dma()' and `free_dma()'.
+ *
+ * In order to avoid problems, all processes should allocate resources in
+ * the same sequence and release them in the reverse order.
+ *
+ * So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA.
+ * When releasing them, first release the DMA, then release the IRQ.
+ * If you don't, you may cause allocation requests to fail unnecessarily.
+ * This doesn't really matter now, but it will once we get real semaphores
+ * in the kernel.
+ */
+
+
+DEFINE_SPINLOCK(dma_spin_lock);
+
+/*
+ * If our port doesn't define this it has no PC like DMA
+ */
+
+#ifdef MAX_DMA_CHANNELS
+
+
+/* Channel n is busy iff dma_chan_busy[n].lock != 0.
+ * DMA0 used to be reserved for DRAM refresh, but apparently not any more...
+ * DMA4 is reserved for cascading.
+ */
+
+struct dma_chan {
+ int lock;
+ const char *device_id;
+};
+
+static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
+ [4] = { 1, "cascade" },
+};
+
+
+int request_dma(unsigned int dmanr, const char * device_id)
+{
+ if (dmanr >= MAX_DMA_CHANNELS)
+ return -EINVAL;
+
+ if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0)
+ return -EBUSY;
+
+ dma_chan_busy[dmanr].device_id = device_id;
+
+ /* old flag was 0, now contains 1 to indicate busy */
+ return 0;
+} /* request_dma */
+
+
+void free_dma(unsigned int dmanr)
+{
+ if (dmanr >= MAX_DMA_CHANNELS) {
+ printk(KERN_WARNING "Trying to free DMA%d\n", dmanr);
+ return;
+ }
+
+ if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
+ printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr);
+ return;
+ }
+
+} /* free_dma */
+
+#else
+
+int request_dma(unsigned int dmanr, const char *device_id)
+{
+ return -EINVAL;
+}
+
+void free_dma(unsigned int dmanr)
+{
+}
+
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+#ifdef MAX_DMA_CHANNELS
+static int proc_dma_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
+ if (dma_chan_busy[i].lock) {
+ seq_printf(m, "%2d: %s\n", i,
+ dma_chan_busy[i].device_id);
+ }
+ }
+ return 0;
+}
+#else
+static int proc_dma_show(struct seq_file *m, void *v)
+{
+ seq_puts(m, "No DMA\n");
+ return 0;
+}
+#endif /* MAX_DMA_CHANNELS */
+
+static int proc_dma_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_dma_show, NULL);
+}
+
+static struct file_operations proc_dma_operations = {
+ .open = proc_dma_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init proc_dma_init(void)
+{
+ struct proc_dir_entry *e;
+
+ e = create_proc_entry("dma", 0, NULL);
+ if (e)
+ e->proc_fops = &proc_dma_operations;
+
+ return 0;
+}
+
+__initcall(proc_dma_init);
+#endif
+
+EXPORT_SYMBOL(request_dma);
+EXPORT_SYMBOL(free_dma);
+EXPORT_SYMBOL(dma_spin_lock);
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
new file mode 100644
index 000000000000..867d6dbeb574
--- /dev/null
+++ b/kernel/exec_domain.c
@@ -0,0 +1,209 @@
+/*
+ * Handling of different ABIs (personalities).
+ *
+ * We group personalities into execution domains which have their
+ * own handlers for kernel entry points, signal mapping, etc...
+ *
+ * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/personality.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/sysctl.h>
+#include <linux/types.h>
+
+
+static void default_handler(int, struct pt_regs *);
+
+static struct exec_domain *exec_domains = &default_exec_domain;
+static DEFINE_RWLOCK(exec_domains_lock);
+
+
+static u_long ident_map[32] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+};
+
+struct exec_domain default_exec_domain = {
+ .name = "Linux", /* name */
+ .handler = default_handler, /* lcall7 causes a seg fault. */
+ .pers_low = 0, /* PER_LINUX personality. */
+ .pers_high = 0, /* PER_LINUX personality. */
+ .signal_map = ident_map, /* Identity map signals. */
+ .signal_invmap = ident_map, /* - both ways. */
+};
+
+
+static void
+default_handler(int segment, struct pt_regs *regp)
+{
+ set_personality(0);
+
+ if (current_thread_info()->exec_domain->handler != default_handler)
+ current_thread_info()->exec_domain->handler(segment, regp);
+ else
+ send_sig(SIGSEGV, current, 1);
+}
+
+static struct exec_domain *
+lookup_exec_domain(u_long personality)
+{
+ struct exec_domain * ep;
+ u_long pers = personality(personality);
+
+ read_lock(&exec_domains_lock);
+ for (ep = exec_domains; ep; ep = ep->next) {
+ if (pers >= ep->pers_low && pers <= ep->pers_high)
+ if (try_module_get(ep->module))
+ goto out;
+ }
+
+#ifdef CONFIG_KMOD
+ read_unlock(&exec_domains_lock);
+ request_module("personality-%ld", pers);
+ read_lock(&exec_domains_lock);
+
+ for (ep = exec_domains; ep; ep = ep->next) {
+ if (pers >= ep->pers_low && pers <= ep->pers_high)
+ if (try_module_get(ep->module))
+ goto out;
+ }
+#endif
+
+ ep = &default_exec_domain;
+out:
+ read_unlock(&exec_domains_lock);
+ return (ep);
+}
+
+int
+register_exec_domain(struct exec_domain *ep)
+{
+ struct exec_domain *tmp;
+ int err = -EBUSY;
+
+ if (ep == NULL)
+ return -EINVAL;
+
+ if (ep->next != NULL)
+ return -EBUSY;
+
+ write_lock(&exec_domains_lock);
+ for (tmp = exec_domains; tmp; tmp = tmp->next) {
+ if (tmp == ep)
+ goto out;
+ }
+
+ ep->next = exec_domains;
+ exec_domains = ep;
+ err = 0;
+
+out:
+ write_unlock(&exec_domains_lock);
+ return (err);
+}
+
+int
+unregister_exec_domain(struct exec_domain *ep)
+{
+ struct exec_domain **epp;
+
+ epp = &exec_domains;
+ write_lock(&exec_domains_lock);
+ for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
+ if (ep == *epp)
+ goto unregister;
+ }
+ write_unlock(&exec_domains_lock);
+ return -EINVAL;
+
+unregister:
+ *epp = ep->next;
+ ep->next = NULL;
+ write_unlock(&exec_domains_lock);
+ return 0;
+}
+
+int
+__set_personality(u_long personality)
+{
+ struct exec_domain *ep, *oep;
+
+ ep = lookup_exec_domain(personality);
+ if (ep == current_thread_info()->exec_domain) {
+ current->personality = personality;
+ return 0;
+ }
+
+ if (atomic_read(&current->fs->count) != 1) {
+ struct fs_struct *fsp, *ofsp;
+
+ fsp = copy_fs_struct(current->fs);
+ if (fsp == NULL) {
+ module_put(ep->module);
+ return -ENOMEM;
+ }
+
+ task_lock(current);
+ ofsp = current->fs;
+ current->fs = fsp;
+ task_unlock(current);
+
+ put_fs_struct(ofsp);
+ }
+
+ /*
+ * At that point we are guaranteed to be the sole owner of
+ * current->fs.
+ */
+
+ current->personality = personality;
+ oep = current_thread_info()->exec_domain;
+ current_thread_info()->exec_domain = ep;
+ set_fs_altroot();
+
+ module_put(oep->module);
+ return 0;
+}
+
+int
+get_exec_domain_list(char *page)
+{
+ struct exec_domain *ep;
+ int len = 0;
+
+ read_lock(&exec_domains_lock);
+ for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next)
+ len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n",
+ ep->pers_low, ep->pers_high, ep->name,
+ module_name(ep->module));
+ read_unlock(&exec_domains_lock);
+ return (len);
+}
+
+asmlinkage long
+sys_personality(u_long personality)
+{
+ u_long old = current->personality;
+
+ if (personality != 0xffffffff) {
+ set_personality(personality);
+ if (current->personality != personality)
+ return -EINVAL;
+ }
+
+ return (long)old;
+}
+
+
+EXPORT_SYMBOL(register_exec_domain);
+EXPORT_SYMBOL(unregister_exec_domain);
+EXPORT_SYMBOL(__set_personality);
diff --git a/kernel/exit.c b/kernel/exit.c
new file mode 100644
index 000000000000..6dd4ebe1dd90
--- /dev/null
+++ b/kernel/exit.c
@@ -0,0 +1,1527 @@
+/*
+ * linux/kernel/exit.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/personality.h>
+#include <linux/tty.h>
+#include <linux/namespace.h>
+#include <linux/key.h>
+#include <linux/security.h>
+#include <linux/cpu.h>
+#include <linux/acct.h>
+#include <linux/file.h>
+#include <linux/binfmts.h>
+#include <linux/ptrace.h>
+#include <linux/profile.h>
+#include <linux/mount.h>
+#include <linux/proc_fs.h>
+#include <linux/mempolicy.h>
+#include <linux/cpuset.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/pgtable.h>
+#include <asm/mmu_context.h>
+
+extern void sem_exit (void);
+extern struct task_struct *child_reaper;
+
+int getrusage(struct task_struct *, int, struct rusage __user *);
+
+static void __unhash_process(struct task_struct *p)
+{
+ nr_threads--;
+ detach_pid(p, PIDTYPE_PID);
+ detach_pid(p, PIDTYPE_TGID);
+ if (thread_group_leader(p)) {
+ detach_pid(p, PIDTYPE_PGID);
+ detach_pid(p, PIDTYPE_SID);
+ if (p->pid)
+ __get_cpu_var(process_counts)--;
+ }
+
+ REMOVE_LINKS(p);
+}
+
+void release_task(struct task_struct * p)
+{
+ int zap_leader;
+ task_t *leader;
+ struct dentry *proc_dentry;
+
+repeat:
+ atomic_dec(&p->user->processes);
+ spin_lock(&p->proc_lock);
+ proc_dentry = proc_pid_unhash(p);
+ write_lock_irq(&tasklist_lock);
+ if (unlikely(p->ptrace))
+ __ptrace_unlink(p);
+ BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+ __exit_signal(p);
+ __exit_sighand(p);
+ __unhash_process(p);
+
+ /*
+ * If we are the last non-leader member of the thread
+ * group, and the leader is zombie, then notify the
+ * group leader's parent process. (if it wants notification.)
+ */
+ zap_leader = 0;
+ leader = p->group_leader;
+ if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
+ BUG_ON(leader->exit_signal == -1);
+ do_notify_parent(leader, leader->exit_signal);
+ /*
+ * If we were the last child thread and the leader has
+ * exited already, and the leader's parent ignores SIGCHLD,
+ * then we are the one who should release the leader.
+ *
+ * do_notify_parent() will have marked it self-reaping in
+ * that case.
+ */
+ zap_leader = (leader->exit_signal == -1);
+ }
+
+ sched_exit(p);
+ write_unlock_irq(&tasklist_lock);
+ spin_unlock(&p->proc_lock);
+ proc_pid_flush(proc_dentry);
+ release_thread(p);
+ put_task_struct(p);
+
+ p = leader;
+ if (unlikely(zap_leader))
+ goto repeat;
+}
+
+/* we are using it only for SMP init */
+
+void unhash_process(struct task_struct *p)
+{
+ struct dentry *proc_dentry;
+
+ spin_lock(&p->proc_lock);
+ proc_dentry = proc_pid_unhash(p);
+ write_lock_irq(&tasklist_lock);
+ __unhash_process(p);
+ write_unlock_irq(&tasklist_lock);
+ spin_unlock(&p->proc_lock);
+ proc_pid_flush(proc_dentry);
+}
+
+/*
+ * This checks not only the pgrp, but falls back on the pid if no
+ * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
+ * without this...
+ */
+int session_of_pgrp(int pgrp)
+{
+ struct task_struct *p;
+ int sid = -1;
+
+ read_lock(&tasklist_lock);
+ do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
+ if (p->signal->session > 0) {
+ sid = p->signal->session;
+ goto out;
+ }
+ } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+ p = find_task_by_pid(pgrp);
+ if (p)
+ sid = p->signal->session;
+out:
+ read_unlock(&tasklist_lock);
+
+ return sid;
+}
+
+/*
+ * Determine if a process group is "orphaned", according to the POSIX
+ * definition in 2.2.2.52. Orphaned process groups are not to be affected
+ * by terminal-generated stop signals. Newly orphaned process groups are
+ * to receive a SIGHUP and a SIGCONT.
+ *
+ * "I ask you, have you ever known what it is to be an orphan?"
+ */
+static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
+{
+ struct task_struct *p;
+ int ret = 1;
+
+ do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
+ if (p == ignored_task
+ || p->exit_state
+ || p->real_parent->pid == 1)
+ continue;
+ if (process_group(p->real_parent) != pgrp
+ && p->real_parent->signal->session == p->signal->session) {
+ ret = 0;
+ break;
+ }
+ } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+ return ret; /* (sighing) "Often!" */
+}
+
+int is_orphaned_pgrp(int pgrp)
+{
+ int retval;
+
+ read_lock(&tasklist_lock);
+ retval = will_become_orphaned_pgrp(pgrp, NULL);
+ read_unlock(&tasklist_lock);
+
+ return retval;
+}
+
+static inline int has_stopped_jobs(int pgrp)
+{
+ int retval = 0;
+ struct task_struct *p;
+
+ do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
+ if (p->state != TASK_STOPPED)
+ continue;
+
+ /* If p is stopped by a debugger on a signal that won't
+ stop it, then don't count p as stopped. This isn't
+ perfect but it's a good approximation. */
+ if (unlikely (p->ptrace)
+ && p->exit_code != SIGSTOP
+ && p->exit_code != SIGTSTP
+ && p->exit_code != SIGTTOU
+ && p->exit_code != SIGTTIN)
+ continue;
+
+ retval = 1;
+ break;
+ } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+ return retval;
+}
+
+/**
+ * reparent_to_init() - Reparent the calling kernel thread to the init task.
+ *
+ * If a kernel thread is launched as a result of a system call, or if
+ * it ever exits, it should generally reparent itself to init so that
+ * it is correctly cleaned up on exit.
+ *
+ * The various task state such as scheduling policy and priority may have
+ * been inherited from a user process, so we reset them to sane values here.
+ *
+ * NOTE that reparent_to_init() gives the caller full capabilities.
+ */
+void reparent_to_init(void)
+{
+ write_lock_irq(&tasklist_lock);
+
+ ptrace_unlink(current);
+ /* Reparent to init */
+ REMOVE_LINKS(current);
+ current->parent = child_reaper;
+ current->real_parent = child_reaper;
+ SET_LINKS(current);
+
+ /* Set the exit signal to SIGCHLD so we signal init on exit */
+ current->exit_signal = SIGCHLD;
+
+ if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0))
+ set_user_nice(current, 0);
+ /* cpus_allowed? */
+ /* rt_priority? */
+ /* signals? */
+ security_task_reparent_to_init(current);
+ memcpy(current->signal->rlim, init_task.signal->rlim,
+ sizeof(current->signal->rlim));
+ atomic_inc(&(INIT_USER->__count));
+ write_unlock_irq(&tasklist_lock);
+ switch_uid(INIT_USER);
+}
+
+void __set_special_pids(pid_t session, pid_t pgrp)
+{
+ struct task_struct *curr = current;
+
+ if (curr->signal->session != session) {
+ detach_pid(curr, PIDTYPE_SID);
+ curr->signal->session = session;
+ attach_pid(curr, PIDTYPE_SID, session);
+ }
+ if (process_group(curr) != pgrp) {
+ detach_pid(curr, PIDTYPE_PGID);
+ curr->signal->pgrp = pgrp;
+ attach_pid(curr, PIDTYPE_PGID, pgrp);
+ }
+}
+
+void set_special_pids(pid_t session, pid_t pgrp)
+{
+ write_lock_irq(&tasklist_lock);
+ __set_special_pids(session, pgrp);
+ write_unlock_irq(&tasklist_lock);
+}
+
+/*
+ * Let kernel threads use this to say that they
+ * allow a certain signal (since daemonize() will
+ * have disabled all of them by default).
+ */
+int allow_signal(int sig)
+{
+ if (sig < 1 || sig > _NSIG)
+ return -EINVAL;
+
+ spin_lock_irq(&current->sighand->siglock);
+ sigdelset(&current->blocked, sig);
+ if (!current->mm) {
+ /* Kernel threads handle their own signals.
+ Let the signal code know it'll be handled, so
+ that they don't get converted to SIGKILL or
+ just silently dropped */
+ current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
+ }
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ return 0;
+}
+
+EXPORT_SYMBOL(allow_signal);
+
+int disallow_signal(int sig)
+{
+ if (sig < 1 || sig > _NSIG)
+ return -EINVAL;
+
+ spin_lock_irq(&current->sighand->siglock);
+ sigaddset(&current->blocked, sig);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ return 0;
+}
+
+EXPORT_SYMBOL(disallow_signal);
+
+/*
+ * Put all the gunge required to become a kernel thread without
+ * attached user resources in one place where it belongs.
+ */
+
+void daemonize(const char *name, ...)
+{
+ va_list args;
+ struct fs_struct *fs;
+ sigset_t blocked;
+
+ va_start(args, name);
+ vsnprintf(current->comm, sizeof(current->comm), name, args);
+ va_end(args);
+
+ /*
+ * If we were started as result of loading a module, close all of the
+ * user space pages. We don't need them, and if we didn't close them
+ * they would be locked into memory.
+ */
+ exit_mm(current);
+
+ set_special_pids(1, 1);
+ down(&tty_sem);
+ current->signal->tty = NULL;
+ up(&tty_sem);
+
+ /* Block and flush all signals */
+ sigfillset(&blocked);
+ sigprocmask(SIG_BLOCK, &blocked, NULL);
+ flush_signals(current);
+
+ /* Become as one with the init task */
+
+ exit_fs(current); /* current->fs->count--; */
+ fs = init_task.fs;
+ current->fs = fs;
+ atomic_inc(&fs->count);
+ exit_files(current);
+ current->files = init_task.files;
+ atomic_inc(&current->files->count);
+
+ reparent_to_init();
+}
+
+EXPORT_SYMBOL(daemonize);
+
+static inline void close_files(struct files_struct * files)
+{
+ int i, j;
+
+ j = 0;
+ for (;;) {
+ unsigned long set;
+ i = j * __NFDBITS;
+ if (i >= files->max_fdset || i >= files->max_fds)
+ break;
+ set = files->open_fds->fds_bits[j++];
+ while (set) {
+ if (set & 1) {
+ struct file * file = xchg(&files->fd[i], NULL);
+ if (file)
+ filp_close(file, files);
+ }
+ i++;
+ set >>= 1;
+ }
+ }
+}
+
+struct files_struct *get_files_struct(struct task_struct *task)
+{
+ struct files_struct *files;
+
+ task_lock(task);
+ files = task->files;
+ if (files)
+ atomic_inc(&files->count);
+ task_unlock(task);
+
+ return files;
+}
+
+void fastcall put_files_struct(struct files_struct *files)
+{
+ if (atomic_dec_and_test(&files->count)) {
+ close_files(files);
+ /*
+ * Free the fd and fdset arrays if we expanded them.
+ */
+ if (files->fd != &files->fd_array[0])
+ free_fd_array(files->fd, files->max_fds);
+ if (files->max_fdset > __FD_SETSIZE) {
+ free_fdset(files->open_fds, files->max_fdset);
+ free_fdset(files->close_on_exec, files->max_fdset);
+ }
+ kmem_cache_free(files_cachep, files);
+ }
+}
+
+EXPORT_SYMBOL(put_files_struct);
+
+static inline void __exit_files(struct task_struct *tsk)
+{
+ struct files_struct * files = tsk->files;
+
+ if (files) {
+ task_lock(tsk);
+ tsk->files = NULL;
+ task_unlock(tsk);
+ put_files_struct(files);
+ }
+}
+
+void exit_files(struct task_struct *tsk)
+{
+ __exit_files(tsk);
+}
+
+static inline void __put_fs_struct(struct fs_struct *fs)
+{
+ /* No need to hold fs->lock if we are killing it */
+ if (atomic_dec_and_test(&fs->count)) {
+ dput(fs->root);
+ mntput(fs->rootmnt);
+ dput(fs->pwd);
+ mntput(fs->pwdmnt);
+ if (fs->altroot) {
+ dput(fs->altroot);
+ mntput(fs->altrootmnt);
+ }
+ kmem_cache_free(fs_cachep, fs);
+ }
+}
+
+void put_fs_struct(struct fs_struct *fs)
+{
+ __put_fs_struct(fs);
+}
+
+static inline void __exit_fs(struct task_struct *tsk)
+{
+ struct fs_struct * fs = tsk->fs;
+
+ if (fs) {
+ task_lock(tsk);
+ tsk->fs = NULL;
+ task_unlock(tsk);
+ __put_fs_struct(fs);
+ }
+}
+
+void exit_fs(struct task_struct *tsk)
+{
+ __exit_fs(tsk);
+}
+
+EXPORT_SYMBOL_GPL(exit_fs);
+
+/*
+ * Turn us into a lazy TLB process if we
+ * aren't already..
+ */
+void exit_mm(struct task_struct * tsk)
+{
+ struct mm_struct *mm = tsk->mm;
+
+ mm_release(tsk, mm);
+ if (!mm)
+ return;
+ /*
+ * Serialize with any possible pending coredump.
+ * We must hold mmap_sem around checking core_waiters
+ * and clearing tsk->mm. The core-inducing thread
+ * will increment core_waiters for each thread in the
+ * group with ->mm != NULL.
+ */
+ down_read(&mm->mmap_sem);
+ if (mm->core_waiters) {
+ up_read(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
+ if (!--mm->core_waiters)
+ complete(mm->core_startup_done);
+ up_write(&mm->mmap_sem);
+
+ wait_for_completion(&mm->core_done);
+ down_read(&mm->mmap_sem);
+ }
+ atomic_inc(&mm->mm_count);
+ if (mm != tsk->active_mm) BUG();
+ /* more a memory barrier than a real lock */
+ task_lock(tsk);
+ tsk->mm = NULL;
+ up_read(&mm->mmap_sem);
+ enter_lazy_tlb(mm, current);
+ task_unlock(tsk);
+ mmput(mm);
+}
+
+static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
+{
+ /*
+ * Make sure we're not reparenting to ourselves and that
+ * the parent is not a zombie.
+ */
+ BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE);
+ p->real_parent = reaper;
+ if (p->parent == p->real_parent)
+ BUG();
+}
+
+static inline void reparent_thread(task_t *p, task_t *father, int traced)
+{
+ /* We don't want people slaying init. */
+ if (p->exit_signal != -1)
+ p->exit_signal = SIGCHLD;
+
+ if (p->pdeath_signal)
+ /* We already hold the tasklist_lock here. */
+ group_send_sig_info(p->pdeath_signal, (void *) 0, p);
+
+ /* Move the child from its dying parent to the new one. */
+ if (unlikely(traced)) {
+ /* Preserve ptrace links if someone else is tracing this child. */
+ list_del_init(&p->ptrace_list);
+ if (p->parent != p->real_parent)
+ list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
+ } else {
+ /* If this child is being traced, then we're the one tracing it
+ * anyway, so let go of it.
+ */
+ p->ptrace = 0;
+ list_del_init(&p->sibling);
+ p->parent = p->real_parent;
+ list_add_tail(&p->sibling, &p->parent->children);
+
+ /* If we'd notified the old parent about this child's death,
+ * also notify the new parent.
+ */
+ if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
+ thread_group_empty(p))
+ do_notify_parent(p, p->exit_signal);
+ else if (p->state == TASK_TRACED) {
+ /*
+ * If it was at a trace stop, turn it into
+ * a normal stop since it's no longer being
+ * traced.
+ */
+ ptrace_untrace(p);
+ }
+ }
+
+ /*
+ * process group orphan check
+ * Case ii: Our child is in a different pgrp
+ * than we are, and it was the only connection
+ * outside, so the child pgrp is now orphaned.
+ */
+ if ((process_group(p) != process_group(father)) &&
+ (p->signal->session == father->signal->session)) {
+ int pgrp = process_group(p);
+
+ if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
+ __kill_pg_info(SIGHUP, (void *)1, pgrp);
+ __kill_pg_info(SIGCONT, (void *)1, pgrp);
+ }
+ }
+}
+
+/*
+ * When we die, we re-parent all our children.
+ * Try to give them to another thread in our thread
+ * group, and if no such member exists, give it to
+ * the global child reaper process (ie "init")
+ */
+static inline void forget_original_parent(struct task_struct * father,
+ struct list_head *to_release)
+{
+ struct task_struct *p, *reaper = father;
+ struct list_head *_p, *_n;
+
+ do {
+ reaper = next_thread(reaper);
+ if (reaper == father) {
+ reaper = child_reaper;
+ break;
+ }
+ } while (reaper->exit_state);
+
+ /*
+ * There are only two places where our children can be:
+ *
+ * - in our child list
+ * - in our ptraced child list
+ *
+ * Search them and reparent children.
+ */
+ list_for_each_safe(_p, _n, &father->children) {
+ int ptrace;
+ p = list_entry(_p,struct task_struct,sibling);
+
+ ptrace = p->ptrace;
+
+ /* if father isn't the real parent, then ptrace must be enabled */
+ BUG_ON(father != p->real_parent && !ptrace);
+
+ if (father == p->real_parent) {
+ /* reparent with a reaper, real father it's us */
+ choose_new_parent(p, reaper, child_reaper);
+ reparent_thread(p, father, 0);
+ } else {
+ /* reparent ptraced task to its real parent */
+ __ptrace_unlink (p);
+ if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
+ thread_group_empty(p))
+ do_notify_parent(p, p->exit_signal);
+ }
+
+ /*
+ * if the ptraced child is a zombie with exit_signal == -1
+ * we must collect it before we exit, or it will remain
+ * zombie forever since we prevented it from self-reap itself
+ * while it was being traced by us, to be able to see it in wait4.
+ */
+ if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
+ list_add(&p->ptrace_list, to_release);
+ }
+ list_for_each_safe(_p, _n, &father->ptrace_children) {
+ p = list_entry(_p,struct task_struct,ptrace_list);
+ choose_new_parent(p, reaper, child_reaper);
+ reparent_thread(p, father, 1);
+ }
+}
+
+/*
+ * Send signals to all our closest relatives so that they know
+ * to properly mourn us..
+ */
+static void exit_notify(struct task_struct *tsk)
+{
+ int state;
+ struct task_struct *t;
+ struct list_head ptrace_dead, *_p, *_n;
+
+ if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
+ && !thread_group_empty(tsk)) {
+ /*
+ * This occurs when there was a race between our exit
+ * syscall and a group signal choosing us as the one to
+ * wake up. It could be that we are the only thread
+ * alerted to check for pending signals, but another thread
+ * should be woken now to take the signal since we will not.
+ * Now we'll wake all the threads in the group just to make
+ * sure someone gets all the pending signals.
+ */
+ read_lock(&tasklist_lock);
+ spin_lock_irq(&tsk->sighand->siglock);
+ for (t = next_thread(tsk); t != tsk; t = next_thread(t))
+ if (!signal_pending(t) && !(t->flags & PF_EXITING)) {
+ recalc_sigpending_tsk(t);
+ if (signal_pending(t))
+ signal_wake_up(t, 0);
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+ read_unlock(&tasklist_lock);
+ }
+
+ write_lock_irq(&tasklist_lock);
+
+ /*
+ * This does two things:
+ *
+ * A. Make init inherit all the child processes
+ * B. Check to see if any process groups have become orphaned
+ * as a result of our exiting, and if they have any stopped
+ * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+
+ INIT_LIST_HEAD(&ptrace_dead);
+ forget_original_parent(tsk, &ptrace_dead);
+ BUG_ON(!list_empty(&tsk->children));
+ BUG_ON(!list_empty(&tsk->ptrace_children));
+
+ /*
+ * Check to see if any process groups have become orphaned
+ * as a result of our exiting, and if they have any stopped
+ * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ *
+ * Case i: Our father is in a different pgrp than we are
+ * and we were the only connection outside, so our pgrp
+ * is about to become orphaned.
+ */
+
+ t = tsk->real_parent;
+
+ if ((process_group(t) != process_group(tsk)) &&
+ (t->signal->session == tsk->signal->session) &&
+ will_become_orphaned_pgrp(process_group(tsk), tsk) &&
+ has_stopped_jobs(process_group(tsk))) {
+ __kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
+ __kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
+ }
+
+ /* Let father know we died
+ *
+ * Thread signals are configurable, but you aren't going to use
+ * that to send signals to arbitary processes.
+ * That stops right now.
+ *
+ * If the parent exec id doesn't match the exec id we saved
+ * when we started then we know the parent has changed security
+ * domain.
+ *
+ * If our self_exec id doesn't match our parent_exec_id then
+ * we have changed execution domain as these two values started
+ * the same after a fork.
+ *
+ */
+
+ if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
+ ( tsk->parent_exec_id != t->self_exec_id ||
+ tsk->self_exec_id != tsk->parent_exec_id)
+ && !capable(CAP_KILL))
+ tsk->exit_signal = SIGCHLD;
+
+
+ /* If something other than our normal parent is ptracing us, then
+ * send it a SIGCHLD instead of honoring exit_signal. exit_signal
+ * only has special meaning to our real parent.
+ */
+ if (tsk->exit_signal != -1 && thread_group_empty(tsk)) {
+ int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD;
+ do_notify_parent(tsk, signal);
+ } else if (tsk->ptrace) {
+ do_notify_parent(tsk, SIGCHLD);
+ }
+
+ state = EXIT_ZOMBIE;
+ if (tsk->exit_signal == -1 &&
+ (likely(tsk->ptrace == 0) ||
+ unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT)))
+ state = EXIT_DEAD;
+ tsk->exit_state = state;
+
+ write_unlock_irq(&tasklist_lock);
+
+ list_for_each_safe(_p, _n, &ptrace_dead) {
+ list_del_init(_p);
+ t = list_entry(_p,struct task_struct,ptrace_list);
+ release_task(t);
+ }
+
+ /* If the process is dead, release it - nobody will wait for it */
+ if (state == EXIT_DEAD)
+ release_task(tsk);
+
+ /* PF_DEAD causes final put_task_struct after we schedule. */
+ preempt_disable();
+ tsk->flags |= PF_DEAD;
+}
+
+fastcall NORET_TYPE void do_exit(long code)
+{
+ struct task_struct *tsk = current;
+ int group_dead;
+
+ profile_task_exit(tsk);
+
+ if (unlikely(in_interrupt()))
+ panic("Aiee, killing interrupt handler!");
+ if (unlikely(!tsk->pid))
+ panic("Attempted to kill the idle task!");
+ if (unlikely(tsk->pid == 1))
+ panic("Attempted to kill init!");
+ if (tsk->io_context)
+ exit_io_context();
+
+ if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
+ current->ptrace_message = code;
+ ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
+ }
+
+ tsk->flags |= PF_EXITING;
+
+ /*
+ * Make sure we don't try to process any timer firings
+ * while we are already exiting.
+ */
+ tsk->it_virt_expires = cputime_zero;
+ tsk->it_prof_expires = cputime_zero;
+ tsk->it_sched_expires = 0;
+
+ if (unlikely(in_atomic()))
+ printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
+ current->comm, current->pid,
+ preempt_count());
+
+ acct_update_integrals(tsk);
+ update_mem_hiwater(tsk);
+ group_dead = atomic_dec_and_test(&tsk->signal->live);
+ if (group_dead) {
+ del_timer_sync(&tsk->signal->real_timer);
+ acct_process(code);
+ }
+ exit_mm(tsk);
+
+ exit_sem(tsk);
+ __exit_files(tsk);
+ __exit_fs(tsk);
+ exit_namespace(tsk);
+ exit_thread();
+ cpuset_exit(tsk);
+ exit_keys(tsk);
+
+ if (group_dead && tsk->signal->leader)
+ disassociate_ctty(1);
+
+ module_put(tsk->thread_info->exec_domain->module);
+ if (tsk->binfmt)
+ module_put(tsk->binfmt->module);
+
+ tsk->exit_code = code;
+ exit_notify(tsk);
+#ifdef CONFIG_NUMA
+ mpol_free(tsk->mempolicy);
+ tsk->mempolicy = NULL;
+#endif
+
+ BUG_ON(!(current->flags & PF_DEAD));
+ schedule();
+ BUG();
+ /* Avoid "noreturn function does return". */
+ for (;;) ;
+}
+
+NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+{
+ if (comp)
+ complete(comp);
+
+ do_exit(code);
+}
+
+EXPORT_SYMBOL(complete_and_exit);
+
+asmlinkage long sys_exit(int error_code)
+{
+ do_exit((error_code&0xff)<<8);
+}
+
+task_t fastcall *next_thread(const task_t *p)
+{
+ return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
+}
+
+EXPORT_SYMBOL(next_thread);
+
+/*
+ * Take down every thread in the group. This is called by fatal signals
+ * as well as by sys_exit_group (below).
+ */
+NORET_TYPE void
+do_group_exit(int exit_code)
+{
+ BUG_ON(exit_code & 0x80); /* core dumps don't get here */
+
+ if (current->signal->flags & SIGNAL_GROUP_EXIT)
+ exit_code = current->signal->group_exit_code;
+ else if (!thread_group_empty(current)) {
+ struct signal_struct *const sig = current->signal;
+ struct sighand_struct *const sighand = current->sighand;
+ read_lock(&tasklist_lock);
+ spin_lock_irq(&sighand->siglock);
+ if (sig->flags & SIGNAL_GROUP_EXIT)
+ /* Another thread got here before we took the lock. */
+ exit_code = sig->group_exit_code;
+ else {
+ sig->flags = SIGNAL_GROUP_EXIT;
+ sig->group_exit_code = exit_code;
+ zap_other_threads(current);
+ }
+ spin_unlock_irq(&sighand->siglock);
+ read_unlock(&tasklist_lock);
+ }
+
+ do_exit(exit_code);
+ /* NOTREACHED */
+}
+
+/*
+ * this kills every thread in the thread group. Note that any externally
+ * wait4()-ing process will get the correct exit code - even if this
+ * thread is not the thread group leader.
+ */
+asmlinkage void sys_exit_group(int error_code)
+{
+ do_group_exit((error_code & 0xff) << 8);
+}
+
+static int eligible_child(pid_t pid, int options, task_t *p)
+{
+ if (pid > 0) {
+ if (p->pid != pid)
+ return 0;
+ } else if (!pid) {
+ if (process_group(p) != process_group(current))
+ return 0;
+ } else if (pid != -1) {
+ if (process_group(p) != -pid)
+ return 0;
+ }
+
+ /*
+ * Do not consider detached threads that are
+ * not ptraced:
+ */
+ if (p->exit_signal == -1 && !p->ptrace)
+ return 0;
+
+ /* Wait for all children (clone and not) if __WALL is set;
+ * otherwise, wait for clone children *only* if __WCLONE is
+ * set; otherwise, wait for non-clone children *only*. (Note:
+ * A "clone" child here is one that reports to its parent
+ * using a signal other than SIGCHLD.) */
+ if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
+ && !(options & __WALL))
+ return 0;
+ /*
+ * Do not consider thread group leaders that are
+ * in a non-empty thread group:
+ */
+ if (current->tgid != p->tgid && delay_group_leader(p))
+ return 2;
+
+ if (security_task_wait(p))
+ return 0;
+
+ return 1;
+}
+
+static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,
+ int why, int status,
+ struct siginfo __user *infop,
+ struct rusage __user *rusagep)
+{
+ int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
+ put_task_struct(p);
+ if (!retval)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval)
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval)
+ retval = put_user(pid, &infop->si_pid);
+ if (!retval)
+ retval = put_user(uid, &infop->si_uid);
+ if (!retval)
+ retval = put_user(status, &infop->si_status);
+ if (!retval)
+ retval = pid;
+ return retval;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_zombie(task_t *p, int noreap,
+ struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ unsigned long state;
+ int retval;
+ int status;
+
+ if (unlikely(noreap)) {
+ pid_t pid = p->pid;
+ uid_t uid = p->uid;
+ int exit_code = p->exit_code;
+ int why, status;
+
+ if (unlikely(p->exit_state != EXIT_ZOMBIE))
+ return 0;
+ if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
+ return 0;
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+ if ((exit_code & 0x7f) == 0) {
+ why = CLD_EXITED;
+ status = exit_code >> 8;
+ } else {
+ why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
+ status = exit_code & 0x7f;
+ }
+ return wait_noreap_copyout(p, pid, uid, why,
+ status, infop, ru);
+ }
+
+ /*
+ * Try to move the task's state to DEAD
+ * only one thread is allowed to do this:
+ */
+ state = xchg(&p->exit_state, EXIT_DEAD);
+ if (state != EXIT_ZOMBIE) {
+ BUG_ON(state != EXIT_DEAD);
+ return 0;
+ }
+ if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) {
+ /*
+ * This can only happen in a race with a ptraced thread
+ * dying on another processor.
+ */
+ return 0;
+ }
+
+ if (likely(p->real_parent == p->parent) && likely(p->signal)) {
+ /*
+ * The resource counters for the group leader are in its
+ * own task_struct. Those for dead threads in the group
+ * are in its signal_struct, as are those for the child
+ * processes it has previously reaped. All these
+ * accumulate in the parent's signal_struct c* fields.
+ *
+ * We don't bother to take a lock here to protect these
+ * p->signal fields, because they are only touched by
+ * __exit_signal, which runs with tasklist_lock
+ * write-locked anyway, and so is excluded here. We do
+ * need to protect the access to p->parent->signal fields,
+ * as other threads in the parent group can be right
+ * here reaping other children at the same time.
+ */
+ spin_lock_irq(&p->parent->sighand->siglock);
+ p->parent->signal->cutime =
+ cputime_add(p->parent->signal->cutime,
+ cputime_add(p->utime,
+ cputime_add(p->signal->utime,
+ p->signal->cutime)));
+ p->parent->signal->cstime =
+ cputime_add(p->parent->signal->cstime,
+ cputime_add(p->stime,
+ cputime_add(p->signal->stime,
+ p->signal->cstime)));
+ p->parent->signal->cmin_flt +=
+ p->min_flt + p->signal->min_flt + p->signal->cmin_flt;
+ p->parent->signal->cmaj_flt +=
+ p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt;
+ p->parent->signal->cnvcsw +=
+ p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw;
+ p->parent->signal->cnivcsw +=
+ p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw;
+ spin_unlock_irq(&p->parent->sighand->siglock);
+ }
+
+ /*
+ * Now we are sure this task is interesting, and no other
+ * thread can reap it because we set its state to EXIT_DEAD.
+ */
+ read_unlock(&tasklist_lock);
+
+ retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+ ? p->signal->group_exit_code : p->exit_code;
+ if (!retval && stat_addr)
+ retval = put_user(status, stat_addr);
+ if (!retval && infop)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval && infop)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval && infop) {
+ int why;
+
+ if ((status & 0x7f) == 0) {
+ why = CLD_EXITED;
+ status >>= 8;
+ } else {
+ why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
+ status &= 0x7f;
+ }
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval)
+ retval = put_user(status, &infop->si_status);
+ }
+ if (!retval && infop)
+ retval = put_user(p->pid, &infop->si_pid);
+ if (!retval && infop)
+ retval = put_user(p->uid, &infop->si_uid);
+ if (retval) {
+ // TODO: is this safe?
+ p->exit_state = EXIT_ZOMBIE;
+ return retval;
+ }
+ retval = p->pid;
+ if (p->real_parent != p->parent) {
+ write_lock_irq(&tasklist_lock);
+ /* Double-check with lock held. */
+ if (p->real_parent != p->parent) {
+ __ptrace_unlink(p);
+ // TODO: is this safe?
+ p->exit_state = EXIT_ZOMBIE;
+ /*
+ * If this is not a detached task, notify the parent.
+ * If it's still not detached after that, don't release
+ * it now.
+ */
+ if (p->exit_signal != -1) {
+ do_notify_parent(p, p->exit_signal);
+ if (p->exit_signal != -1)
+ p = NULL;
+ }
+ }
+ write_unlock_irq(&tasklist_lock);
+ }
+ if (p != NULL)
+ release_task(p);
+ BUG_ON(!retval);
+ return retval;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
+ struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ int retval, exit_code;
+
+ if (!p->exit_code)
+ return 0;
+ if (delayed_group_leader && !(p->ptrace & PT_PTRACED) &&
+ p->signal && p->signal->group_stop_count > 0)
+ /*
+ * A group stop is in progress and this is the group leader.
+ * We won't report until all threads have stopped.
+ */
+ return 0;
+
+ /*
+ * Now we are pretty sure this task is interesting.
+ * Make sure it doesn't get reaped out from under us while we
+ * give up the lock and then examine it below. We don't want to
+ * keep holding onto the tasklist_lock while we call getrusage and
+ * possibly take page faults for user memory.
+ */
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+
+ if (unlikely(noreap)) {
+ pid_t pid = p->pid;
+ uid_t uid = p->uid;
+ int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
+
+ exit_code = p->exit_code;
+ if (unlikely(!exit_code) ||
+ unlikely(p->state > TASK_STOPPED))
+ goto bail_ref;
+ return wait_noreap_copyout(p, pid, uid,
+ why, (exit_code << 8) | 0x7f,
+ infop, ru);
+ }
+
+ write_lock_irq(&tasklist_lock);
+
+ /*
+ * This uses xchg to be atomic with the thread resuming and setting
+ * it. It must also be done with the write lock held to prevent a
+ * race with the EXIT_ZOMBIE case.
+ */
+ exit_code = xchg(&p->exit_code, 0);
+ if (unlikely(p->exit_state)) {
+ /*
+ * The task resumed and then died. Let the next iteration
+ * catch it in EXIT_ZOMBIE. Note that exit_code might
+ * already be zero here if it resumed and did _exit(0).
+ * The task itself is dead and won't touch exit_code again;
+ * other processors in this function are locked out.
+ */
+ p->exit_code = exit_code;
+ exit_code = 0;
+ }
+ if (unlikely(exit_code == 0)) {
+ /*
+ * Another thread in this function got to it first, or it
+ * resumed, or it resumed and then died.
+ */
+ write_unlock_irq(&tasklist_lock);
+bail_ref:
+ put_task_struct(p);
+ /*
+ * We are returning to the wait loop without having successfully
+ * removed the process and having released the lock. We cannot
+ * continue, since the "p" task pointer is potentially stale.
+ *
+ * Return -EAGAIN, and do_wait() will restart the loop from the
+ * beginning. Do _not_ re-acquire the lock.
+ */
+ return -EAGAIN;
+ }
+
+ /* move to end of parent's list to avoid starvation */
+ remove_parent(p);
+ add_parent(p, p->parent);
+
+ write_unlock_irq(&tasklist_lock);
+
+ retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ if (!retval && stat_addr)
+ retval = put_user((exit_code << 8) | 0x7f, stat_addr);
+ if (!retval && infop)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval && infop)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval && infop)
+ retval = put_user((short)((p->ptrace & PT_PTRACED)
+ ? CLD_TRAPPED : CLD_STOPPED),
+ &infop->si_code);
+ if (!retval && infop)
+ retval = put_user(exit_code, &infop->si_status);
+ if (!retval && infop)
+ retval = put_user(p->pid, &infop->si_pid);
+ if (!retval && infop)
+ retval = put_user(p->uid, &infop->si_uid);
+ if (!retval)
+ retval = p->pid;
+ put_task_struct(p);
+
+ BUG_ON(!retval);
+ return retval;
+}
+
+/*
+ * Handle do_wait work for one task in a live, non-stopped state.
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_continued(task_t *p, int noreap,
+ struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ int retval;
+ pid_t pid;
+ uid_t uid;
+
+ if (unlikely(!p->signal))
+ return 0;
+
+ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
+ return 0;
+
+ spin_lock_irq(&p->sighand->siglock);
+ /* Re-check with the lock held. */
+ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
+ spin_unlock_irq(&p->sighand->siglock);
+ return 0;
+ }
+ if (!noreap)
+ p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
+ spin_unlock_irq(&p->sighand->siglock);
+
+ pid = p->pid;
+ uid = p->uid;
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+
+ if (!infop) {
+ retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ put_task_struct(p);
+ if (!retval && stat_addr)
+ retval = put_user(0xffff, stat_addr);
+ if (!retval)
+ retval = p->pid;
+ } else {
+ retval = wait_noreap_copyout(p, pid, uid,
+ CLD_CONTINUED, SIGCONT,
+ infop, ru);
+ BUG_ON(retval == 0);
+ }
+
+ return retval;
+}
+
+
+static inline int my_ptrace_child(struct task_struct *p)
+{
+ if (!(p->ptrace & PT_PTRACED))
+ return 0;
+ if (!(p->ptrace & PT_ATTACHED))
+ return 1;
+ /*
+ * This child was PTRACE_ATTACH'd. We should be seeing it only if
+ * we are the attacher. If we are the real parent, this is a race
+ * inside ptrace_attach. It is waiting for the tasklist_lock,
+ * which we have to switch the parent links, but has already set
+ * the flags in p->ptrace.
+ */
+ return (p->parent != p->real_parent);
+}
+
+static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct task_struct *tsk;
+ int flag, retval;
+
+ add_wait_queue(&current->signal->wait_chldexit,&wait);
+repeat:
+ /*
+ * We will set this flag if we see any child that might later
+ * match our criteria, even if we are not able to reap it yet.
+ */
+ flag = 0;
+ current->state = TASK_INTERRUPTIBLE;
+ read_lock(&tasklist_lock);
+ tsk = current;
+ do {
+ struct task_struct *p;
+ struct list_head *_p;
+ int ret;
+
+ list_for_each(_p,&tsk->children) {
+ p = list_entry(_p,struct task_struct,sibling);
+
+ ret = eligible_child(pid, options, p);
+ if (!ret)
+ continue;
+
+ switch (p->state) {
+ case TASK_TRACED:
+ if (!my_ptrace_child(p))
+ continue;
+ /*FALLTHROUGH*/
+ case TASK_STOPPED:
+ /*
+ * It's stopped now, so it might later
+ * continue, exit, or stop again.
+ */
+ flag = 1;
+ if (!(options & WUNTRACED) &&
+ !my_ptrace_child(p))
+ continue;
+ retval = wait_task_stopped(p, ret == 2,
+ (options & WNOWAIT),
+ infop,
+ stat_addr, ru);
+ if (retval == -EAGAIN)
+ goto repeat;
+ if (retval != 0) /* He released the lock. */
+ goto end;
+ break;
+ default:
+ // case EXIT_DEAD:
+ if (p->exit_state == EXIT_DEAD)
+ continue;
+ // case EXIT_ZOMBIE:
+ if (p->exit_state == EXIT_ZOMBIE) {
+ /*
+ * Eligible but we cannot release
+ * it yet:
+ */
+ if (ret == 2)
+ goto check_continued;
+ if (!likely(options & WEXITED))
+ continue;
+ retval = wait_task_zombie(
+ p, (options & WNOWAIT),
+ infop, stat_addr, ru);
+ /* He released the lock. */
+ if (retval != 0)
+ goto end;
+ break;
+ }
+check_continued:
+ /*
+ * It's running now, so it might later
+ * exit, stop, or stop and then continue.
+ */
+ flag = 1;
+ if (!unlikely(options & WCONTINUED))
+ continue;
+ retval = wait_task_continued(
+ p, (options & WNOWAIT),
+ infop, stat_addr, ru);
+ if (retval != 0) /* He released the lock. */
+ goto end;
+ break;
+ }
+ }
+ if (!flag) {
+ list_for_each(_p, &tsk->ptrace_children) {
+ p = list_entry(_p, struct task_struct,
+ ptrace_list);
+ if (!eligible_child(pid, options, p))
+ continue;
+ flag = 1;
+ break;
+ }
+ }
+ if (options & __WNOTHREAD)
+ break;
+ tsk = next_thread(tsk);
+ if (tsk->signal != current->signal)
+ BUG();
+ } while (tsk != current);
+
+ read_unlock(&tasklist_lock);
+ if (flag) {
+ retval = 0;
+ if (options & WNOHANG)
+ goto end;
+ retval = -ERESTARTSYS;
+ if (signal_pending(current))
+ goto end;
+ schedule();
+ goto repeat;
+ }
+ retval = -ECHILD;
+end:
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&current->signal->wait_chldexit,&wait);
+ if (infop) {
+ if (retval > 0)
+ retval = 0;
+ else {
+ /*
+ * For a WNOHANG return, clear out all the fields
+ * we would set so the user can easily tell the
+ * difference.
+ */
+ if (!retval)
+ retval = put_user(0, &infop->si_signo);
+ if (!retval)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval)
+ retval = put_user(0, &infop->si_code);
+ if (!retval)
+ retval = put_user(0, &infop->si_pid);
+ if (!retval)
+ retval = put_user(0, &infop->si_uid);
+ if (!retval)
+ retval = put_user(0, &infop->si_status);
+ }
+ }
+ return retval;
+}
+
+asmlinkage long sys_waitid(int which, pid_t pid,
+ struct siginfo __user *infop, int options,
+ struct rusage __user *ru)
+{
+ long ret;
+
+ if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
+ return -EINVAL;
+ if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
+ return -EINVAL;
+
+ switch (which) {
+ case P_ALL:
+ pid = -1;
+ break;
+ case P_PID:
+ if (pid <= 0)
+ return -EINVAL;
+ break;
+ case P_PGID:
+ if (pid <= 0)
+ return -EINVAL;
+ pid = -pid;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ ret = do_wait(pid, options, infop, NULL, ru);
+
+ /* avoid REGPARM breakage on x86: */
+ prevent_tail_call(ret);
+ return ret;
+}
+
+asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
+ int options, struct rusage __user *ru)
+{
+ long ret;
+
+ if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
+ __WNOTHREAD|__WCLONE|__WALL))
+ return -EINVAL;
+ ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru);
+
+ /* avoid REGPARM breakage on x86: */
+ prevent_tail_call(ret);
+ return ret;
+}
+
+#ifdef __ARCH_WANT_SYS_WAITPID
+
+/*
+ * sys_waitpid() remains for compatibility. waitpid() should be
+ * implemented by calling sys_wait4() from libc.a.
+ */
+asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options)
+{
+ return sys_wait4(pid, stat_addr, options, NULL);
+}
+
+#endif
diff --git a/kernel/extable.c b/kernel/extable.c
new file mode 100644
index 000000000000..7501b531ceed
--- /dev/null
+++ b/kernel/extable.c
@@ -0,0 +1,67 @@
+/* Rewritten by Rusty Russell, on the backs of many others...
+ Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <linux/module.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <asm/sections.h>
+
+extern struct exception_table_entry __start___ex_table[];
+extern struct exception_table_entry __stop___ex_table[];
+
+/* Sort the kernel's built-in exception table */
+void __init sort_main_extable(void)
+{
+ sort_extable(__start___ex_table, __stop___ex_table);
+}
+
+/* Given an address, look for it in the exception tables. */
+const struct exception_table_entry *search_exception_tables(unsigned long addr)
+{
+ const struct exception_table_entry *e;
+
+ e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
+ if (!e)
+ e = search_module_extables(addr);
+ return e;
+}
+
+static int core_kernel_text(unsigned long addr)
+{
+ if (addr >= (unsigned long)_stext &&
+ addr <= (unsigned long)_etext)
+ return 1;
+
+ if (addr >= (unsigned long)_sinittext &&
+ addr <= (unsigned long)_einittext)
+ return 1;
+ return 0;
+}
+
+int __kernel_text_address(unsigned long addr)
+{
+ if (core_kernel_text(addr))
+ return 1;
+ return __module_text_address(addr) != NULL;
+}
+
+int kernel_text_address(unsigned long addr)
+{
+ if (core_kernel_text(addr))
+ return 1;
+ return module_text_address(addr) != NULL;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
new file mode 100644
index 000000000000..f42a17f88699
--- /dev/null
+++ b/kernel/fork.c
@@ -0,0 +1,1274 @@
+/*
+ * linux/kernel/fork.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * 'fork.c' contains the help-routines for the 'fork' system call
+ * (see also entry.S and others).
+ * Fork is rather simple, once you get the hang of it, but the memory
+ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/unistd.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/completion.h>
+#include <linux/namespace.h>
+#include <linux/personality.h>
+#include <linux/mempolicy.h>
+#include <linux/sem.h>
+#include <linux/file.h>
+#include <linux/key.h>
+#include <linux/binfmts.h>
+#include <linux/mman.h>
+#include <linux/fs.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/security.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+#include <linux/jiffies.h>
+#include <linux/futex.h>
+#include <linux/ptrace.h>
+#include <linux/mount.h>
+#include <linux/audit.h>
+#include <linux/profile.h>
+#include <linux/rmap.h>
+#include <linux/acct.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Protected counters by write_lock_irq(&tasklist_lock)
+ */
+unsigned long total_forks; /* Handle normal Linux uptimes. */
+int nr_threads; /* The idle threads do not count.. */
+
+int max_threads; /* tunable limit on nr_threads */
+
+DEFINE_PER_CPU(unsigned long, process_counts) = 0;
+
+ __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
+
+EXPORT_SYMBOL(tasklist_lock);
+
+int nr_processes(void)
+{
+ int cpu;
+ int total = 0;
+
+ for_each_online_cpu(cpu)
+ total += per_cpu(process_counts, cpu);
+
+ return total;
+}
+
+#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
+# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
+static kmem_cache_t *task_struct_cachep;
+#endif
+
+/* SLAB cache for signal_struct structures (tsk->signal) */
+kmem_cache_t *signal_cachep;
+
+/* SLAB cache for sighand_struct structures (tsk->sighand) */
+kmem_cache_t *sighand_cachep;
+
+/* SLAB cache for files_struct structures (tsk->files) */
+kmem_cache_t *files_cachep;
+
+/* SLAB cache for fs_struct structures (tsk->fs) */
+kmem_cache_t *fs_cachep;
+
+/* SLAB cache for vm_area_struct structures */
+kmem_cache_t *vm_area_cachep;
+
+/* SLAB cache for mm_struct structures (tsk->mm) */
+static kmem_cache_t *mm_cachep;
+
+void free_task(struct task_struct *tsk)
+{
+ free_thread_info(tsk->thread_info);
+ free_task_struct(tsk);
+}
+EXPORT_SYMBOL(free_task);
+
+void __put_task_struct(struct task_struct *tsk)
+{
+ WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
+ WARN_ON(atomic_read(&tsk->usage));
+ WARN_ON(tsk == current);
+
+ if (unlikely(tsk->audit_context))
+ audit_free(tsk);
+ security_task_free(tsk);
+ free_uid(tsk->user);
+ put_group_info(tsk->group_info);
+
+ if (!profile_handoff_task(tsk))
+ free_task(tsk);
+}
+
+void __init fork_init(unsigned long mempages)
+{
+#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef ARCH_MIN_TASKALIGN
+#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
+#endif
+ /* create a slab on which task_structs can be allocated */
+ task_struct_cachep =
+ kmem_cache_create("task_struct", sizeof(struct task_struct),
+ ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
+#endif
+
+ /*
+ * The default maximum number of threads is set to a safe
+ * value: the thread structures can take up at most half
+ * of memory.
+ */
+ max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
+
+ /*
+ * we need to allow at least 20 threads to boot a system
+ */
+ if(max_threads < 20)
+ max_threads = 20;
+
+ init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
+ init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
+ init_task.signal->rlim[RLIMIT_SIGPENDING] =
+ init_task.signal->rlim[RLIMIT_NPROC];
+}
+
+static struct task_struct *dup_task_struct(struct task_struct *orig)
+{
+ struct task_struct *tsk;
+ struct thread_info *ti;
+
+ prepare_to_copy(orig);
+
+ tsk = alloc_task_struct();
+ if (!tsk)
+ return NULL;
+
+ ti = alloc_thread_info(tsk);
+ if (!ti) {
+ free_task_struct(tsk);
+ return NULL;
+ }
+
+ *ti = *orig->thread_info;
+ *tsk = *orig;
+ tsk->thread_info = ti;
+ ti->task = tsk;
+
+ /* One for us, one for whoever does the "release_task()" (usually parent) */
+ atomic_set(&tsk->usage,2);
+ return tsk;
+}
+
+#ifdef CONFIG_MMU
+static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
+{
+ struct vm_area_struct * mpnt, *tmp, **pprev;
+ struct rb_node **rb_link, *rb_parent;
+ int retval;
+ unsigned long charge;
+ struct mempolicy *pol;
+
+ down_write(&oldmm->mmap_sem);
+ flush_cache_mm(current->mm);
+ mm->locked_vm = 0;
+ mm->mmap = NULL;
+ mm->mmap_cache = NULL;
+ mm->free_area_cache = oldmm->mmap_base;
+ mm->map_count = 0;
+ set_mm_counter(mm, rss, 0);
+ set_mm_counter(mm, anon_rss, 0);
+ cpus_clear(mm->cpu_vm_mask);
+ mm->mm_rb = RB_ROOT;
+ rb_link = &mm->mm_rb.rb_node;
+ rb_parent = NULL;
+ pprev = &mm->mmap;
+
+ for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
+ struct file *file;
+
+ if (mpnt->vm_flags & VM_DONTCOPY) {
+ __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
+ -vma_pages(mpnt));
+ continue;
+ }
+ charge = 0;
+ if (mpnt->vm_flags & VM_ACCOUNT) {
+ unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+ if (security_vm_enough_memory(len))
+ goto fail_nomem;
+ charge = len;
+ }
+ tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!tmp)
+ goto fail_nomem;
+ *tmp = *mpnt;
+ pol = mpol_copy(vma_policy(mpnt));
+ retval = PTR_ERR(pol);
+ if (IS_ERR(pol))
+ goto fail_nomem_policy;
+ vma_set_policy(tmp, pol);
+ tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_mm = mm;
+ tmp->vm_next = NULL;
+ anon_vma_link(tmp);
+ file = tmp->vm_file;
+ if (file) {
+ struct inode *inode = file->f_dentry->d_inode;
+ get_file(file);
+ if (tmp->vm_flags & VM_DENYWRITE)
+ atomic_dec(&inode->i_writecount);
+
+ /* insert tmp into the share list, just after mpnt */
+ spin_lock(&file->f_mapping->i_mmap_lock);
+ tmp->vm_truncate_count = mpnt->vm_truncate_count;
+ flush_dcache_mmap_lock(file->f_mapping);
+ vma_prio_tree_add(tmp, mpnt);
+ flush_dcache_mmap_unlock(file->f_mapping);
+ spin_unlock(&file->f_mapping->i_mmap_lock);
+ }
+
+ /*
+ * Link in the new vma and copy the page table entries:
+ * link in first so that swapoff can see swap entries,
+ * and try_to_unmap_one's find_vma find the new vma.
+ */
+ spin_lock(&mm->page_table_lock);
+ *pprev = tmp;
+ pprev = &tmp->vm_next;
+
+ __vma_link_rb(mm, tmp, rb_link, rb_parent);
+ rb_link = &tmp->vm_rb.rb_right;
+ rb_parent = &tmp->vm_rb;
+
+ mm->map_count++;
+ retval = copy_page_range(mm, current->mm, tmp);
+ spin_unlock(&mm->page_table_lock);
+
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+
+ if (retval)
+ goto out;
+ }
+ retval = 0;
+
+out:
+ flush_tlb_mm(current->mm);
+ up_write(&oldmm->mmap_sem);
+ return retval;
+fail_nomem_policy:
+ kmem_cache_free(vm_area_cachep, tmp);
+fail_nomem:
+ retval = -ENOMEM;
+ vm_unacct_memory(charge);
+ goto out;
+}
+
+static inline int mm_alloc_pgd(struct mm_struct * mm)
+{
+ mm->pgd = pgd_alloc(mm);
+ if (unlikely(!mm->pgd))
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct * mm)
+{
+ pgd_free(mm->pgd);
+}
+#else
+#define dup_mmap(mm, oldmm) (0)
+#define mm_alloc_pgd(mm) (0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
+
+#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
+#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+
+#include <linux/init_task.h>
+
+static struct mm_struct * mm_init(struct mm_struct * mm)
+{
+ atomic_set(&mm->mm_users, 1);
+ atomic_set(&mm->mm_count, 1);
+ init_rwsem(&mm->mmap_sem);
+ INIT_LIST_HEAD(&mm->mmlist);
+ mm->core_waiters = 0;
+ mm->nr_ptes = 0;
+ spin_lock_init(&mm->page_table_lock);
+ rwlock_init(&mm->ioctx_list_lock);
+ mm->ioctx_list = NULL;
+ mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
+ mm->free_area_cache = TASK_UNMAPPED_BASE;
+
+ if (likely(!mm_alloc_pgd(mm))) {
+ mm->def_flags = 0;
+ return mm;
+ }
+ free_mm(mm);
+ return NULL;
+}
+
+/*
+ * Allocate and initialize an mm_struct.
+ */
+struct mm_struct * mm_alloc(void)
+{
+ struct mm_struct * mm;
+
+ mm = allocate_mm();
+ if (mm) {
+ memset(mm, 0, sizeof(*mm));
+ mm = mm_init(mm);
+ }
+ return mm;
+}
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+void fastcall __mmdrop(struct mm_struct *mm)
+{
+ BUG_ON(mm == &init_mm);
+ mm_free_pgd(mm);
+ destroy_context(mm);
+ free_mm(mm);
+}
+
+/*
+ * Decrement the use count and release all resources for an mm.
+ */
+void mmput(struct mm_struct *mm)
+{
+ if (atomic_dec_and_test(&mm->mm_users)) {
+ exit_aio(mm);
+ exit_mmap(mm);
+ if (!list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ list_del(&mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ put_swap_token(mm);
+ mmdrop(mm);
+ }
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+/**
+ * get_task_mm - acquire a reference to the task's mm
+ *
+ * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning
+ * this kernel workthread has transiently adopted a user mm with use_mm,
+ * to do its AIO) is not set and if so returns a reference to it, after
+ * bumping up the use count. User must release the mm via mmput()
+ * after use. Typically used by /proc and ptrace.
+ */
+struct mm_struct *get_task_mm(struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ task_lock(task);
+ mm = task->mm;
+ if (mm) {
+ if (task->flags & PF_BORROWED_MM)
+ mm = NULL;
+ else
+ atomic_inc(&mm->mm_users);
+ }
+ task_unlock(task);
+ return mm;
+}
+EXPORT_SYMBOL_GPL(get_task_mm);
+
+/* Please note the differences between mmput and mm_release.
+ * mmput is called whenever we stop holding onto a mm_struct,
+ * error success whatever.
+ *
+ * mm_release is called after a mm_struct has been removed
+ * from the current process.
+ *
+ * This difference is important for error handling, when we
+ * only half set up a mm_struct for a new process and need to restore
+ * the old one. Because we mmput the new mm_struct before
+ * restoring the old one. . .
+ * Eric Biederman 10 January 1998
+ */
+void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+ struct completion *vfork_done = tsk->vfork_done;
+
+ /* Get rid of any cached register state */
+ deactivate_mm(tsk, mm);
+
+ /* notify parent sleeping on vfork() */
+ if (vfork_done) {
+ tsk->vfork_done = NULL;
+ complete(vfork_done);
+ }
+ if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
+ u32 __user * tidptr = tsk->clear_child_tid;
+ tsk->clear_child_tid = NULL;
+
+ /*
+ * We don't check the error code - if userspace has
+ * not set up a proper pointer then tough luck.
+ */
+ put_user(0, tidptr);
+ sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
+ }
+}
+
+static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
+{
+ struct mm_struct * mm, *oldmm;
+ int retval;
+
+ tsk->min_flt = tsk->maj_flt = 0;
+ tsk->nvcsw = tsk->nivcsw = 0;
+
+ tsk->mm = NULL;
+ tsk->active_mm = NULL;
+
+ /*
+ * Are we cloning a kernel thread?
+ *
+ * We need to steal a active VM for that..
+ */
+ oldmm = current->mm;
+ if (!oldmm)
+ return 0;
+
+ if (clone_flags & CLONE_VM) {
+ atomic_inc(&oldmm->mm_users);
+ mm = oldmm;
+ /*
+ * There are cases where the PTL is held to ensure no
+ * new threads start up in user mode using an mm, which
+ * allows optimizing out ipis; the tlb_gather_mmu code
+ * is an example.
+ */
+ spin_unlock_wait(&oldmm->page_table_lock);
+ goto good_mm;
+ }
+
+ retval = -ENOMEM;
+ mm = allocate_mm();
+ if (!mm)
+ goto fail_nomem;
+
+ /* Copy the current MM stuff.. */
+ memcpy(mm, oldmm, sizeof(*mm));
+ if (!mm_init(mm))
+ goto fail_nomem;
+
+ if (init_new_context(tsk,mm))
+ goto fail_nocontext;
+
+ retval = dup_mmap(mm, oldmm);
+ if (retval)
+ goto free_pt;
+
+ mm->hiwater_rss = get_mm_counter(mm,rss);
+ mm->hiwater_vm = mm->total_vm;
+
+good_mm:
+ tsk->mm = mm;
+ tsk->active_mm = mm;
+ return 0;
+
+free_pt:
+ mmput(mm);
+fail_nomem:
+ return retval;
+
+fail_nocontext:
+ /*
+ * If init_new_context() failed, we cannot use mmput() to free the mm
+ * because it calls destroy_context()
+ */
+ mm_free_pgd(mm);
+ free_mm(mm);
+ return retval;
+}
+
+static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
+{
+ struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
+ /* We don't need to lock fs - think why ;-) */
+ if (fs) {
+ atomic_set(&fs->count, 1);
+ rwlock_init(&fs->lock);
+ fs->umask = old->umask;
+ read_lock(&old->lock);
+ fs->rootmnt = mntget(old->rootmnt);
+ fs->root = dget(old->root);
+ fs->pwdmnt = mntget(old->pwdmnt);
+ fs->pwd = dget(old->pwd);
+ if (old->altroot) {
+ fs->altrootmnt = mntget(old->altrootmnt);
+ fs->altroot = dget(old->altroot);
+ } else {
+ fs->altrootmnt = NULL;
+ fs->altroot = NULL;
+ }
+ read_unlock(&old->lock);
+ }
+ return fs;
+}
+
+struct fs_struct *copy_fs_struct(struct fs_struct *old)
+{
+ return __copy_fs_struct(old);
+}
+
+EXPORT_SYMBOL_GPL(copy_fs_struct);
+
+static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
+{
+ if (clone_flags & CLONE_FS) {
+ atomic_inc(&current->fs->count);
+ return 0;
+ }
+ tsk->fs = __copy_fs_struct(current->fs);
+ if (!tsk->fs)
+ return -ENOMEM;
+ return 0;
+}
+
+static int count_open_files(struct files_struct *files, int size)
+{
+ int i;
+
+ /* Find the last open fd */
+ for (i = size/(8*sizeof(long)); i > 0; ) {
+ if (files->open_fds->fds_bits[--i])
+ break;
+ }
+ i = (i+1) * 8 * sizeof(long);
+ return i;
+}
+
+static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+{
+ struct files_struct *oldf, *newf;
+ struct file **old_fds, **new_fds;
+ int open_files, size, i, error = 0, expand;
+
+ /*
+ * A background process may not have any files ...
+ */
+ oldf = current->files;
+ if (!oldf)
+ goto out;
+
+ if (clone_flags & CLONE_FILES) {
+ atomic_inc(&oldf->count);
+ goto out;
+ }
+
+ /*
+ * Note: we may be using current for both targets (See exec.c)
+ * This works because we cache current->files (old) as oldf. Don't
+ * break this.
+ */
+ tsk->files = NULL;
+ error = -ENOMEM;
+ newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+ if (!newf)
+ goto out;
+
+ atomic_set(&newf->count, 1);
+
+ spin_lock_init(&newf->file_lock);
+ newf->next_fd = 0;
+ newf->max_fds = NR_OPEN_DEFAULT;
+ newf->max_fdset = __FD_SETSIZE;
+ newf->close_on_exec = &newf->close_on_exec_init;
+ newf->open_fds = &newf->open_fds_init;
+ newf->fd = &newf->fd_array[0];
+
+ spin_lock(&oldf->file_lock);
+
+ open_files = count_open_files(oldf, oldf->max_fdset);
+ expand = 0;
+
+ /*
+ * Check whether we need to allocate a larger fd array or fd set.
+ * Note: we're not a clone task, so the open count won't change.
+ */
+ if (open_files > newf->max_fdset) {
+ newf->max_fdset = 0;
+ expand = 1;
+ }
+ if (open_files > newf->max_fds) {
+ newf->max_fds = 0;
+ expand = 1;
+ }
+
+ /* if the old fdset gets grown now, we'll only copy up to "size" fds */
+ if (expand) {
+ spin_unlock(&oldf->file_lock);
+ spin_lock(&newf->file_lock);
+ error = expand_files(newf, open_files-1);
+ spin_unlock(&newf->file_lock);
+ if (error < 0)
+ goto out_release;
+ spin_lock(&oldf->file_lock);
+ }
+
+ old_fds = oldf->fd;
+ new_fds = newf->fd;
+
+ memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
+ memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
+
+ for (i = open_files; i != 0; i--) {
+ struct file *f = *old_fds++;
+ if (f) {
+ get_file(f);
+ } else {
+ /*
+ * The fd may be claimed in the fd bitmap but not yet
+ * instantiated in the files array if a sibling thread
+ * is partway through open(). So make sure that this
+ * fd is available to the new process.
+ */
+ FD_CLR(open_files - i, newf->open_fds);
+ }
+ *new_fds++ = f;
+ }
+ spin_unlock(&oldf->file_lock);
+
+ /* compute the remainder to be cleared */
+ size = (newf->max_fds - open_files) * sizeof(struct file *);
+
+ /* This is long word aligned thus could use a optimized version */
+ memset(new_fds, 0, size);
+
+ if (newf->max_fdset > open_files) {
+ int left = (newf->max_fdset-open_files)/8;
+ int start = open_files / (8 * sizeof(unsigned long));
+
+ memset(&newf->open_fds->fds_bits[start], 0, left);
+ memset(&newf->close_on_exec->fds_bits[start], 0, left);
+ }
+
+ tsk->files = newf;
+ error = 0;
+out:
+ return error;
+
+out_release:
+ free_fdset (newf->close_on_exec, newf->max_fdset);
+ free_fdset (newf->open_fds, newf->max_fdset);
+ free_fd_array(newf->fd, newf->max_fds);
+ kmem_cache_free(files_cachep, newf);
+ goto out;
+}
+
+/*
+ * Helper to unshare the files of the current task.
+ * We don't want to expose copy_files internals to
+ * the exec layer of the kernel.
+ */
+
+int unshare_files(void)
+{
+ struct files_struct *files = current->files;
+ int rc;
+
+ if(!files)
+ BUG();
+
+ /* This can race but the race causes us to copy when we don't
+ need to and drop the copy */
+ if(atomic_read(&files->count) == 1)
+ {
+ atomic_inc(&files->count);
+ return 0;
+ }
+ rc = copy_files(0, current);
+ if(rc)
+ current->files = files;
+ return rc;
+}
+
+EXPORT_SYMBOL(unshare_files);
+
+static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
+{
+ struct sighand_struct *sig;
+
+ if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
+ atomic_inc(&current->sighand->count);
+ return 0;
+ }
+ sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
+ tsk->sighand = sig;
+ if (!sig)
+ return -ENOMEM;
+ spin_lock_init(&sig->siglock);
+ atomic_set(&sig->count, 1);
+ memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+ return 0;
+}
+
+static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
+{
+ struct signal_struct *sig;
+ int ret;
+
+ if (clone_flags & CLONE_THREAD) {
+ atomic_inc(&current->signal->count);
+ atomic_inc(&current->signal->live);
+ return 0;
+ }
+ sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
+ tsk->signal = sig;
+ if (!sig)
+ return -ENOMEM;
+
+ ret = copy_thread_group_keys(tsk);
+ if (ret < 0) {
+ kmem_cache_free(signal_cachep, sig);
+ return ret;
+ }
+
+ atomic_set(&sig->count, 1);
+ atomic_set(&sig->live, 1);
+ init_waitqueue_head(&sig->wait_chldexit);
+ sig->flags = 0;
+ sig->group_exit_code = 0;
+ sig->group_exit_task = NULL;
+ sig->group_stop_count = 0;
+ sig->curr_target = NULL;
+ init_sigpending(&sig->shared_pending);
+ INIT_LIST_HEAD(&sig->posix_timers);
+
+ sig->it_real_value = sig->it_real_incr = 0;
+ sig->real_timer.function = it_real_fn;
+ sig->real_timer.data = (unsigned long) tsk;
+ init_timer(&sig->real_timer);
+
+ sig->it_virt_expires = cputime_zero;
+ sig->it_virt_incr = cputime_zero;
+ sig->it_prof_expires = cputime_zero;
+ sig->it_prof_incr = cputime_zero;
+
+ sig->tty = current->signal->tty;
+ sig->pgrp = process_group(current);
+ sig->session = current->signal->session;
+ sig->leader = 0; /* session leadership doesn't inherit */
+ sig->tty_old_pgrp = 0;
+
+ sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+ sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
+ sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
+ sig->sched_time = 0;
+ INIT_LIST_HEAD(&sig->cpu_timers[0]);
+ INIT_LIST_HEAD(&sig->cpu_timers[1]);
+ INIT_LIST_HEAD(&sig->cpu_timers[2]);
+
+ task_lock(current->group_leader);
+ memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
+ task_unlock(current->group_leader);
+
+ if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+ /*
+ * New sole thread in the process gets an expiry time
+ * of the whole CPU time limit.
+ */
+ tsk->it_prof_expires =
+ secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+ }
+
+ return 0;
+}
+
+static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
+{
+ unsigned long new_flags = p->flags;
+
+ new_flags &= ~PF_SUPERPRIV;
+ new_flags |= PF_FORKNOEXEC;
+ if (!(clone_flags & CLONE_PTRACE))
+ p->ptrace = 0;
+ p->flags = new_flags;
+}
+
+asmlinkage long sys_set_tid_address(int __user *tidptr)
+{
+ current->clear_child_tid = tidptr;
+
+ return current->pid;
+}
+
+/*
+ * This creates a new process as a copy of the old one,
+ * but does not actually start it yet.
+ *
+ * It copies the registers, and all the appropriate
+ * parts of the process environment (as per the clone
+ * flags). The actual kick-off is left to the caller.
+ */
+static task_t *copy_process(unsigned long clone_flags,
+ unsigned long stack_start,
+ struct pt_regs *regs,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr,
+ int pid)
+{
+ int retval;
+ struct task_struct *p = NULL;
+
+ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Thread groups must share signals as well, and detached threads
+ * can only be started up within the thread group.
+ */
+ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Shared signal handlers imply shared VM. By way of the above,
+ * thread groups also imply shared VM. Blocking this case allows
+ * for various simplifications in other code.
+ */
+ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
+ return ERR_PTR(-EINVAL);
+
+ retval = security_task_create(clone_flags);
+ if (retval)
+ goto fork_out;
+
+ retval = -ENOMEM;
+ p = dup_task_struct(current);
+ if (!p)
+ goto fork_out;
+
+ retval = -EAGAIN;
+ if (atomic_read(&p->user->processes) >=
+ p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
+ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
+ p->user != &root_user)
+ goto bad_fork_free;
+ }
+
+ atomic_inc(&p->user->__count);
+ atomic_inc(&p->user->processes);
+ get_group_info(p->group_info);
+
+ /*
+ * If multiple threads are within copy_process(), then this check
+ * triggers too late. This doesn't hurt, the check is only there
+ * to stop root fork bombs.
+ */
+ if (nr_threads >= max_threads)
+ goto bad_fork_cleanup_count;
+
+ if (!try_module_get(p->thread_info->exec_domain->module))
+ goto bad_fork_cleanup_count;
+
+ if (p->binfmt && !try_module_get(p->binfmt->module))
+ goto bad_fork_cleanup_put_domain;
+
+ p->did_exec = 0;
+ copy_flags(clone_flags, p);
+ p->pid = pid;
+ retval = -EFAULT;
+ if (clone_flags & CLONE_PARENT_SETTID)
+ if (put_user(p->pid, parent_tidptr))
+ goto bad_fork_cleanup;
+
+ p->proc_dentry = NULL;
+
+ INIT_LIST_HEAD(&p->children);
+ INIT_LIST_HEAD(&p->sibling);
+ p->vfork_done = NULL;
+ spin_lock_init(&p->alloc_lock);
+ spin_lock_init(&p->proc_lock);
+
+ clear_tsk_thread_flag(p, TIF_SIGPENDING);
+ init_sigpending(&p->pending);
+
+ p->utime = cputime_zero;
+ p->stime = cputime_zero;
+ p->sched_time = 0;
+ p->rchar = 0; /* I/O counter: bytes read */
+ p->wchar = 0; /* I/O counter: bytes written */
+ p->syscr = 0; /* I/O counter: read syscalls */
+ p->syscw = 0; /* I/O counter: write syscalls */
+ acct_clear_integrals(p);
+
+ p->it_virt_expires = cputime_zero;
+ p->it_prof_expires = cputime_zero;
+ p->it_sched_expires = 0;
+ INIT_LIST_HEAD(&p->cpu_timers[0]);
+ INIT_LIST_HEAD(&p->cpu_timers[1]);
+ INIT_LIST_HEAD(&p->cpu_timers[2]);
+
+ p->lock_depth = -1; /* -1 = no lock */
+ do_posix_clock_monotonic_gettime(&p->start_time);
+ p->security = NULL;
+ p->io_context = NULL;
+ p->io_wait = NULL;
+ p->audit_context = NULL;
+#ifdef CONFIG_NUMA
+ p->mempolicy = mpol_copy(p->mempolicy);
+ if (IS_ERR(p->mempolicy)) {
+ retval = PTR_ERR(p->mempolicy);
+ p->mempolicy = NULL;
+ goto bad_fork_cleanup;
+ }
+#endif
+
+ p->tgid = p->pid;
+ if (clone_flags & CLONE_THREAD)
+ p->tgid = current->tgid;
+
+ if ((retval = security_task_alloc(p)))
+ goto bad_fork_cleanup_policy;
+ if ((retval = audit_alloc(p)))
+ goto bad_fork_cleanup_security;
+ /* copy all the process information */
+ if ((retval = copy_semundo(clone_flags, p)))
+ goto bad_fork_cleanup_audit;
+ if ((retval = copy_files(clone_flags, p)))
+ goto bad_fork_cleanup_semundo;
+ if ((retval = copy_fs(clone_flags, p)))
+ goto bad_fork_cleanup_files;
+ if ((retval = copy_sighand(clone_flags, p)))
+ goto bad_fork_cleanup_fs;
+ if ((retval = copy_signal(clone_flags, p)))
+ goto bad_fork_cleanup_sighand;
+ if ((retval = copy_mm(clone_flags, p)))
+ goto bad_fork_cleanup_signal;
+ if ((retval = copy_keys(clone_flags, p)))
+ goto bad_fork_cleanup_mm;
+ if ((retval = copy_namespace(clone_flags, p)))
+ goto bad_fork_cleanup_keys;
+ retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
+ if (retval)
+ goto bad_fork_cleanup_namespace;
+
+ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+ /*
+ * Clear TID on mm_release()?
+ */
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+
+ /*
+ * Syscall tracing should be turned off in the child regardless
+ * of CLONE_PTRACE.
+ */
+ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
+
+ /* Our parent execution domain becomes current domain
+ These must match for thread signalling to apply */
+
+ p->parent_exec_id = p->self_exec_id;
+
+ /* ok, now we should be set up.. */
+ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
+ p->pdeath_signal = 0;
+ p->exit_state = 0;
+
+ /* Perform scheduler related setup */
+ sched_fork(p);
+
+ /*
+ * Ok, make it visible to the rest of the system.
+ * We dont wake it up yet.
+ */
+ p->group_leader = p;
+ INIT_LIST_HEAD(&p->ptrace_children);
+ INIT_LIST_HEAD(&p->ptrace_list);
+
+ /* Need tasklist lock for parent etc handling! */
+ write_lock_irq(&tasklist_lock);
+
+ /*
+ * The task hasn't been attached yet, so cpus_allowed mask cannot
+ * have changed. The cpus_allowed mask of the parent may have
+ * changed after it was copied first time, and it may then move to
+ * another CPU - so we re-copy it here and set the child's CPU to
+ * the parent's CPU. This avoids alot of nasty races.
+ */
+ p->cpus_allowed = current->cpus_allowed;
+ set_task_cpu(p, smp_processor_id());
+
+ /*
+ * Check for pending SIGKILL! The new thread should not be allowed
+ * to slip out of an OOM kill. (or normal SIGKILL.)
+ */
+ if (sigismember(&current->pending.signal, SIGKILL)) {
+ write_unlock_irq(&tasklist_lock);
+ retval = -EINTR;
+ goto bad_fork_cleanup_namespace;
+ }
+
+ /* CLONE_PARENT re-uses the old parent */
+ if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+ p->real_parent = current->real_parent;
+ else
+ p->real_parent = current;
+ p->parent = p->real_parent;
+
+ if (clone_flags & CLONE_THREAD) {
+ spin_lock(&current->sighand->siglock);
+ /*
+ * Important: if an exit-all has been started then
+ * do not create this new thread - the whole thread
+ * group is supposed to exit anyway.
+ */
+ if (current->signal->flags & SIGNAL_GROUP_EXIT) {
+ spin_unlock(&current->sighand->siglock);
+ write_unlock_irq(&tasklist_lock);
+ retval = -EAGAIN;
+ goto bad_fork_cleanup_namespace;
+ }
+ p->group_leader = current->group_leader;
+
+ if (current->signal->group_stop_count > 0) {
+ /*
+ * There is an all-stop in progress for the group.
+ * We ourselves will stop as soon as we check signals.
+ * Make the new thread part of that group stop too.
+ */
+ current->signal->group_stop_count++;
+ set_tsk_thread_flag(p, TIF_SIGPENDING);
+ }
+
+ if (!cputime_eq(current->signal->it_virt_expires,
+ cputime_zero) ||
+ !cputime_eq(current->signal->it_prof_expires,
+ cputime_zero) ||
+ current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
+ !list_empty(&current->signal->cpu_timers[0]) ||
+ !list_empty(&current->signal->cpu_timers[1]) ||
+ !list_empty(&current->signal->cpu_timers[2])) {
+ /*
+ * Have child wake up on its first tick to check
+ * for process CPU timers.
+ */
+ p->it_prof_expires = jiffies_to_cputime(1);
+ }
+
+ spin_unlock(&current->sighand->siglock);
+ }
+
+ SET_LINKS(p);
+ if (unlikely(p->ptrace & PT_PTRACED))
+ __ptrace_link(p, current->parent);
+
+ cpuset_fork(p);
+
+ attach_pid(p, PIDTYPE_PID, p->pid);
+ attach_pid(p, PIDTYPE_TGID, p->tgid);
+ if (thread_group_leader(p)) {
+ attach_pid(p, PIDTYPE_PGID, process_group(p));
+ attach_pid(p, PIDTYPE_SID, p->signal->session);
+ if (p->pid)
+ __get_cpu_var(process_counts)++;
+ }
+
+ nr_threads++;
+ total_forks++;
+ write_unlock_irq(&tasklist_lock);
+ retval = 0;
+
+fork_out:
+ if (retval)
+ return ERR_PTR(retval);
+ return p;
+
+bad_fork_cleanup_namespace:
+ exit_namespace(p);
+bad_fork_cleanup_keys:
+ exit_keys(p);
+bad_fork_cleanup_mm:
+ if (p->mm)
+ mmput(p->mm);
+bad_fork_cleanup_signal:
+ exit_signal(p);
+bad_fork_cleanup_sighand:
+ exit_sighand(p);
+bad_fork_cleanup_fs:
+ exit_fs(p); /* blocking */
+bad_fork_cleanup_files:
+ exit_files(p); /* blocking */
+bad_fork_cleanup_semundo:
+ exit_sem(p);
+bad_fork_cleanup_audit:
+ audit_free(p);
+bad_fork_cleanup_security:
+ security_task_free(p);
+bad_fork_cleanup_policy:
+#ifdef CONFIG_NUMA
+ mpol_free(p->mempolicy);
+#endif
+bad_fork_cleanup:
+ if (p->binfmt)
+ module_put(p->binfmt->module);
+bad_fork_cleanup_put_domain:
+ module_put(p->thread_info->exec_domain->module);
+bad_fork_cleanup_count:
+ put_group_info(p->group_info);
+ atomic_dec(&p->user->processes);
+ free_uid(p->user);
+bad_fork_free:
+ free_task(p);
+ goto fork_out;
+}
+
+struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
+{
+ memset(regs, 0, sizeof(struct pt_regs));
+ return regs;
+}
+
+task_t * __devinit fork_idle(int cpu)
+{
+ task_t *task;
+ struct pt_regs regs;
+
+ task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
+ if (!task)
+ return ERR_PTR(-ENOMEM);
+ init_idle(task, cpu);
+ unhash_process(task);
+ return task;
+}
+
+static inline int fork_traceflag (unsigned clone_flags)
+{
+ if (clone_flags & CLONE_UNTRACED)
+ return 0;
+ else if (clone_flags & CLONE_VFORK) {
+ if (current->ptrace & PT_TRACE_VFORK)
+ return PTRACE_EVENT_VFORK;
+ } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
+ if (current->ptrace & PT_TRACE_CLONE)
+ return PTRACE_EVENT_CLONE;
+ } else if (current->ptrace & PT_TRACE_FORK)
+ return PTRACE_EVENT_FORK;
+
+ return 0;
+}
+
+/*
+ * Ok, this is the main fork-routine.
+ *
+ * It copies the process, and if successful kick-starts
+ * it and waits for it to finish using the VM if required.
+ */
+long do_fork(unsigned long clone_flags,
+ unsigned long stack_start,
+ struct pt_regs *regs,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr)
+{
+ struct task_struct *p;
+ int trace = 0;
+ long pid = alloc_pidmap();
+
+ if (pid < 0)
+ return -EAGAIN;
+ if (unlikely(current->ptrace)) {
+ trace = fork_traceflag (clone_flags);
+ if (trace)
+ clone_flags |= CLONE_PTRACE;
+ }
+
+ p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
+ /*
+ * Do this prior waking up the new thread - the thread pointer
+ * might get invalid after that point, if the thread exits quickly.
+ */
+ if (!IS_ERR(p)) {
+ struct completion vfork;
+
+ if (clone_flags & CLONE_VFORK) {
+ p->vfork_done = &vfork;
+ init_completion(&vfork);
+ }
+
+ if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
+ /*
+ * We'll start up with an immediate SIGSTOP.
+ */
+ sigaddset(&p->pending.signal, SIGSTOP);
+ set_tsk_thread_flag(p, TIF_SIGPENDING);
+ }
+
+ if (!(clone_flags & CLONE_STOPPED))
+ wake_up_new_task(p, clone_flags);
+ else
+ p->state = TASK_STOPPED;
+
+ if (unlikely (trace)) {
+ current->ptrace_message = pid;
+ ptrace_notify ((trace << 8) | SIGTRAP);
+ }
+
+ if (clone_flags & CLONE_VFORK) {
+ wait_for_completion(&vfork);
+ if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
+ ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
+ }
+ } else {
+ free_pidmap(pid);
+ pid = PTR_ERR(p);
+ }
+ return pid;
+}
+
+void __init proc_caches_init(void)
+{
+ sighand_cachep = kmem_cache_create("sighand_cache",
+ sizeof(struct sighand_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+ signal_cachep = kmem_cache_create("signal_cache",
+ sizeof(struct signal_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+ files_cachep = kmem_cache_create("files_cache",
+ sizeof(struct files_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+ fs_cachep = kmem_cache_create("fs_cache",
+ sizeof(struct fs_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), 0,
+ SLAB_PANIC, NULL, NULL);
+ mm_cachep = kmem_cache_create("mm_struct",
+ sizeof(struct mm_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+}
diff --git a/kernel/futex.c b/kernel/futex.c
new file mode 100644
index 000000000000..7b54a672d0ad
--- /dev/null
+++ b/kernel/futex.c
@@ -0,0 +1,798 @@
+/*
+ * Fast Userspace Mutexes (which I call "Futexes!").
+ * (C) Rusty Russell, IBM 2002
+ *
+ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
+ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
+ *
+ * Removed page pinning, fix privately mapped COW pages and other cleanups
+ * (C) Copyright 2003, 2004 Jamie Lokier
+ *
+ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
+ * enough at me, Linus for the original (flawed) idea, Matthew
+ * Kirkwood for proof-of-concept implementation.
+ *
+ * "The futexes are also cursed."
+ * "But they come in a choice of three flavours!"
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/jhash.h>
+#include <linux/init.h>
+#include <linux/futex.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+
+#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+
+/*
+ * Futexes are matched on equal values of this key.
+ * The key type depends on whether it's a shared or private mapping.
+ * Don't rearrange members without looking at hash_futex().
+ *
+ * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
+ * We set bit 0 to indicate if it's an inode-based key.
+ */
+union futex_key {
+ struct {
+ unsigned long pgoff;
+ struct inode *inode;
+ int offset;
+ } shared;
+ struct {
+ unsigned long uaddr;
+ struct mm_struct *mm;
+ int offset;
+ } private;
+ struct {
+ unsigned long word;
+ void *ptr;
+ int offset;
+ } both;
+};
+
+/*
+ * We use this hashed waitqueue instead of a normal wait_queue_t, so
+ * we can wake only the relevant ones (hashed queues may be shared).
+ *
+ * A futex_q has a woken state, just like tasks have TASK_RUNNING.
+ * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
+ * The order of wakup is always to make the first condition true, then
+ * wake up q->waiters, then make the second condition true.
+ */
+struct futex_q {
+ struct list_head list;
+ wait_queue_head_t waiters;
+
+ /* Which hash list lock to use. */
+ spinlock_t *lock_ptr;
+
+ /* Key which the futex is hashed on. */
+ union futex_key key;
+
+ /* For fd, sigio sent using these. */
+ int fd;
+ struct file *filp;
+};
+
+/*
+ * Split the global futex_lock into every hash list lock.
+ */
+struct futex_hash_bucket {
+ spinlock_t lock;
+ struct list_head chain;
+};
+
+static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+
+/* Futex-fs vfsmount entry: */
+static struct vfsmount *futex_mnt;
+
+/*
+ * We hash on the keys returned from get_futex_key (see below).
+ */
+static struct futex_hash_bucket *hash_futex(union futex_key *key)
+{
+ u32 hash = jhash2((u32*)&key->both.word,
+ (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+ key->both.offset);
+ return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+}
+
+/*
+ * Return 1 if two futex_keys are equal, 0 otherwise.
+ */
+static inline int match_futex(union futex_key *key1, union futex_key *key2)
+{
+ return (key1->both.word == key2->both.word
+ && key1->both.ptr == key2->both.ptr
+ && key1->both.offset == key2->both.offset);
+}
+
+/*
+ * Get parameters which are the keys for a futex.
+ *
+ * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
+ * offset_within_page). For private mappings, it's (uaddr, current->mm).
+ * We can usually work out the index without swapping in the page.
+ *
+ * Returns: 0, or negative error code.
+ * The key words are stored in *key on success.
+ *
+ * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
+ */
+static int get_futex_key(unsigned long uaddr, union futex_key *key)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct page *page;
+ int err;
+
+ /*
+ * The futex address must be "naturally" aligned.
+ */
+ key->both.offset = uaddr % PAGE_SIZE;
+ if (unlikely((key->both.offset % sizeof(u32)) != 0))
+ return -EINVAL;
+ uaddr -= key->both.offset;
+
+ /*
+ * The futex is hashed differently depending on whether
+ * it's in a shared or private mapping. So check vma first.
+ */
+ vma = find_extend_vma(mm, uaddr);
+ if (unlikely(!vma))
+ return -EFAULT;
+
+ /*
+ * Permissions.
+ */
+ if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
+ return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+
+ /*
+ * Private mappings are handled in a simple way.
+ *
+ * NOTE: When userspace waits on a MAP_SHARED mapping, even if
+ * it's a read-only handle, it's expected that futexes attach to
+ * the object not the particular process. Therefore we use
+ * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
+ * mappings of _writable_ handles.
+ */
+ if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
+ key->private.mm = mm;
+ key->private.uaddr = uaddr;
+ return 0;
+ }
+
+ /*
+ * Linear file mappings are also simple.
+ */
+ key->shared.inode = vma->vm_file->f_dentry->d_inode;
+ key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
+ if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
+ key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
+ + vma->vm_pgoff);
+ return 0;
+ }
+
+ /*
+ * We could walk the page table to read the non-linear
+ * pte, and get the page index without fetching the page
+ * from swap. But that's a lot of code to duplicate here
+ * for a rare case, so we simply fetch the page.
+ */
+
+ /*
+ * Do a quick atomic lookup first - this is the fastpath.
+ */
+ spin_lock(&current->mm->page_table_lock);
+ page = follow_page(mm, uaddr, 0);
+ if (likely(page != NULL)) {
+ key->shared.pgoff =
+ page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ spin_unlock(&current->mm->page_table_lock);
+ return 0;
+ }
+ spin_unlock(&current->mm->page_table_lock);
+
+ /*
+ * Do it the general way.
+ */
+ err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
+ if (err >= 0) {
+ key->shared.pgoff =
+ page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ put_page(page);
+ return 0;
+ }
+ return err;
+}
+
+/*
+ * Take a reference to the resource addressed by a key.
+ * Can be called while holding spinlocks.
+ *
+ * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
+ * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
+ */
+static inline void get_key_refs(union futex_key *key)
+{
+ if (key->both.ptr != 0) {
+ if (key->both.offset & 1)
+ atomic_inc(&key->shared.inode->i_count);
+ else
+ atomic_inc(&key->private.mm->mm_count);
+ }
+}
+
+/*
+ * Drop a reference to the resource addressed by a key.
+ * The hash bucket spinlock must not be held.
+ */
+static void drop_key_refs(union futex_key *key)
+{
+ if (key->both.ptr != 0) {
+ if (key->both.offset & 1)
+ iput(key->shared.inode);
+ else
+ mmdrop(key->private.mm);
+ }
+}
+
+static inline int get_futex_value_locked(int *dest, int __user *from)
+{
+ int ret;
+
+ inc_preempt_count();
+ ret = __copy_from_user_inatomic(dest, from, sizeof(int));
+ dec_preempt_count();
+
+ return ret ? -EFAULT : 0;
+}
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed.
+ */
+static void wake_futex(struct futex_q *q)
+{
+ list_del_init(&q->list);
+ if (q->filp)
+ send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
+ /*
+ * The lock in wake_up_all() is a crucial memory barrier after the
+ * list_del_init() and also before assigning to q->lock_ptr.
+ */
+ wake_up_all(&q->waiters);
+ /*
+ * The waiting task can free the futex_q as soon as this is written,
+ * without taking any locks. This must come last.
+ */
+ q->lock_ptr = NULL;
+}
+
+/*
+ * Wake up all waiters hashed on the physical page that is mapped
+ * to this virtual address:
+ */
+static int futex_wake(unsigned long uaddr, int nr_wake)
+{
+ union futex_key key;
+ struct futex_hash_bucket *bh;
+ struct list_head *head;
+ struct futex_q *this, *next;
+ int ret;
+
+ down_read(&current->mm->mmap_sem);
+
+ ret = get_futex_key(uaddr, &key);
+ if (unlikely(ret != 0))
+ goto out;
+
+ bh = hash_futex(&key);
+ spin_lock(&bh->lock);
+ head = &bh->chain;
+
+ list_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, &key)) {
+ wake_futex(this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ spin_unlock(&bh->lock);
+out:
+ up_read(&current->mm->mmap_sem);
+ return ret;
+}
+
+/*
+ * Requeue all waiters hashed on one physical page to another
+ * physical page.
+ */
+static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
+ int nr_wake, int nr_requeue, int *valp)
+{
+ union futex_key key1, key2;
+ struct futex_hash_bucket *bh1, *bh2;
+ struct list_head *head1;
+ struct futex_q *this, *next;
+ int ret, drop_count = 0;
+
+ retry:
+ down_read(&current->mm->mmap_sem);
+
+ ret = get_futex_key(uaddr1, &key1);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, &key2);
+ if (unlikely(ret != 0))
+ goto out;
+
+ bh1 = hash_futex(&key1);
+ bh2 = hash_futex(&key2);
+
+ if (bh1 < bh2)
+ spin_lock(&bh1->lock);
+ spin_lock(&bh2->lock);
+ if (bh1 > bh2)
+ spin_lock(&bh1->lock);
+
+ if (likely(valp != NULL)) {
+ int curval;
+
+ ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
+
+ if (unlikely(ret)) {
+ spin_unlock(&bh1->lock);
+ if (bh1 != bh2)
+ spin_unlock(&bh2->lock);
+
+ /* If we would have faulted, release mmap_sem, fault
+ * it in and start all over again.
+ */
+ up_read(&current->mm->mmap_sem);
+
+ ret = get_user(curval, (int __user *)uaddr1);
+
+ if (!ret)
+ goto retry;
+
+ return ret;
+ }
+ if (curval != *valp) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ head1 = &bh1->chain;
+ list_for_each_entry_safe(this, next, head1, list) {
+ if (!match_futex (&this->key, &key1))
+ continue;
+ if (++ret <= nr_wake) {
+ wake_futex(this);
+ } else {
+ list_move_tail(&this->list, &bh2->chain);
+ this->lock_ptr = &bh2->lock;
+ this->key = key2;
+ get_key_refs(&key2);
+ drop_count++;
+
+ if (ret - nr_wake >= nr_requeue)
+ break;
+ /* Make sure to stop if key1 == key2 */
+ if (head1 == &bh2->chain && head1 != &next->list)
+ head1 = &this->list;
+ }
+ }
+
+out_unlock:
+ spin_unlock(&bh1->lock);
+ if (bh1 != bh2)
+ spin_unlock(&bh2->lock);
+
+ /* drop_key_refs() must be called outside the spinlocks. */
+ while (--drop_count >= 0)
+ drop_key_refs(&key1);
+
+out:
+ up_read(&current->mm->mmap_sem);
+ return ret;
+}
+
+/* The key must be already stored in q->key. */
+static inline struct futex_hash_bucket *
+queue_lock(struct futex_q *q, int fd, struct file *filp)
+{
+ struct futex_hash_bucket *bh;
+
+ q->fd = fd;
+ q->filp = filp;
+
+ init_waitqueue_head(&q->waiters);
+
+ get_key_refs(&q->key);
+ bh = hash_futex(&q->key);
+ q->lock_ptr = &bh->lock;
+
+ spin_lock(&bh->lock);
+ return bh;
+}
+
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
+{
+ list_add_tail(&q->list, &bh->chain);
+ spin_unlock(&bh->lock);
+}
+
+static inline void
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
+{
+ spin_unlock(&bh->lock);
+ drop_key_refs(&q->key);
+}
+
+/*
+ * queue_me and unqueue_me must be called as a pair, each
+ * exactly once. They are called with the hashed spinlock held.
+ */
+
+/* The key must be already stored in q->key. */
+static void queue_me(struct futex_q *q, int fd, struct file *filp)
+{
+ struct futex_hash_bucket *bh;
+ bh = queue_lock(q, fd, filp);
+ __queue_me(q, bh);
+}
+
+/* Return 1 if we were still queued (ie. 0 means we were woken) */
+static int unqueue_me(struct futex_q *q)
+{
+ int ret = 0;
+ spinlock_t *lock_ptr;
+
+ /* In the common case we don't take the spinlock, which is nice. */
+ retry:
+ lock_ptr = q->lock_ptr;
+ if (lock_ptr != 0) {
+ spin_lock(lock_ptr);
+ /*
+ * q->lock_ptr can change between reading it and
+ * spin_lock(), causing us to take the wrong lock. This
+ * corrects the race condition.
+ *
+ * Reasoning goes like this: if we have the wrong lock,
+ * q->lock_ptr must have changed (maybe several times)
+ * between reading it and the spin_lock(). It can
+ * change again after the spin_lock() but only if it was
+ * already changed before the spin_lock(). It cannot,
+ * however, change back to the original value. Therefore
+ * we can detect whether we acquired the correct lock.
+ */
+ if (unlikely(lock_ptr != q->lock_ptr)) {
+ spin_unlock(lock_ptr);
+ goto retry;
+ }
+ WARN_ON(list_empty(&q->list));
+ list_del(&q->list);
+ spin_unlock(lock_ptr);
+ ret = 1;
+ }
+
+ drop_key_refs(&q->key);
+ return ret;
+}
+
+static int futex_wait(unsigned long uaddr, int val, unsigned long time)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ int ret, curval;
+ struct futex_q q;
+ struct futex_hash_bucket *bh;
+
+ retry:
+ down_read(&current->mm->mmap_sem);
+
+ ret = get_futex_key(uaddr, &q.key);
+ if (unlikely(ret != 0))
+ goto out_release_sem;
+
+ bh = queue_lock(&q, -1, NULL);
+
+ /*
+ * Access the page AFTER the futex is queued.
+ * Order is important:
+ *
+ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
+ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
+ *
+ * The basic logical guarantee of a futex is that it blocks ONLY
+ * if cond(var) is known to be true at the time of blocking, for
+ * any cond. If we queued after testing *uaddr, that would open
+ * a race condition where we could block indefinitely with
+ * cond(var) false, which would violate the guarantee.
+ *
+ * A consequence is that futex_wait() can return zero and absorb
+ * a wakeup when *uaddr != val on entry to the syscall. This is
+ * rare, but normal.
+ *
+ * We hold the mmap semaphore, so the mapping cannot have changed
+ * since we looked it up in get_futex_key.
+ */
+
+ ret = get_futex_value_locked(&curval, (int __user *)uaddr);
+
+ if (unlikely(ret)) {
+ queue_unlock(&q, bh);
+
+ /* If we would have faulted, release mmap_sem, fault it in and
+ * start all over again.
+ */
+ up_read(&current->mm->mmap_sem);
+
+ ret = get_user(curval, (int __user *)uaddr);
+
+ if (!ret)
+ goto retry;
+ return ret;
+ }
+ if (curval != val) {
+ ret = -EWOULDBLOCK;
+ queue_unlock(&q, bh);
+ goto out_release_sem;
+ }
+
+ /* Only actually queue if *uaddr contained val. */
+ __queue_me(&q, bh);
+
+ /*
+ * Now the futex is queued and we have checked the data, we
+ * don't want to hold mmap_sem while we sleep.
+ */
+ up_read(&current->mm->mmap_sem);
+
+ /*
+ * There might have been scheduling since the queue_me(), as we
+ * cannot hold a spinlock across the get_user() in case it
+ * faults, and we cannot just set TASK_INTERRUPTIBLE state when
+ * queueing ourselves into the futex hash. This code thus has to
+ * rely on the futex_wake() code removing us from hash when it
+ * wakes us up.
+ */
+
+ /* add_wait_queue is the barrier after __set_current_state. */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&q.waiters, &wait);
+ /*
+ * !list_empty() is safe here without any lock.
+ * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
+ */
+ if (likely(!list_empty(&q.list)))
+ time = schedule_timeout(time);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * NOTE: we don't remove ourselves from the waitqueue because
+ * we are the only user of it.
+ */
+
+ /* If we were woken (and unqueued), we succeeded, whatever. */
+ if (!unqueue_me(&q))
+ return 0;
+ if (time == 0)
+ return -ETIMEDOUT;
+ /* We expect signal_pending(current), but another thread may
+ * have handled it for us already. */
+ return -EINTR;
+
+ out_release_sem:
+ up_read(&current->mm->mmap_sem);
+ return ret;
+}
+
+static int futex_close(struct inode *inode, struct file *filp)
+{
+ struct futex_q *q = filp->private_data;
+
+ unqueue_me(q);
+ kfree(q);
+ return 0;
+}
+
+/* This is one-shot: once it's gone off you need a new fd */
+static unsigned int futex_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ struct futex_q *q = filp->private_data;
+ int ret = 0;
+
+ poll_wait(filp, &q->waiters, wait);
+
+ /*
+ * list_empty() is safe here without any lock.
+ * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
+ */
+ if (list_empty(&q->list))
+ ret = POLLIN | POLLRDNORM;
+
+ return ret;
+}
+
+static struct file_operations futex_fops = {
+ .release = futex_close,
+ .poll = futex_poll,
+};
+
+/*
+ * Signal allows caller to avoid the race which would occur if they
+ * set the sigio stuff up afterwards.
+ */
+static int futex_fd(unsigned long uaddr, int signal)
+{
+ struct futex_q *q;
+ struct file *filp;
+ int ret, err;
+
+ ret = -EINVAL;
+ if (signal < 0 || signal > _NSIG)
+ goto out;
+
+ ret = get_unused_fd();
+ if (ret < 0)
+ goto out;
+ filp = get_empty_filp();
+ if (!filp) {
+ put_unused_fd(ret);
+ ret = -ENFILE;
+ goto out;
+ }
+ filp->f_op = &futex_fops;
+ filp->f_vfsmnt = mntget(futex_mnt);
+ filp->f_dentry = dget(futex_mnt->mnt_root);
+ filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+
+ if (signal) {
+ int err;
+ err = f_setown(filp, current->pid, 1);
+ if (err < 0) {
+ put_unused_fd(ret);
+ put_filp(filp);
+ ret = err;
+ goto out;
+ }
+ filp->f_owner.signum = signal;
+ }
+
+ q = kmalloc(sizeof(*q), GFP_KERNEL);
+ if (!q) {
+ put_unused_fd(ret);
+ put_filp(filp);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ down_read(&current->mm->mmap_sem);
+ err = get_futex_key(uaddr, &q->key);
+
+ if (unlikely(err != 0)) {
+ up_read(&current->mm->mmap_sem);
+ put_unused_fd(ret);
+ put_filp(filp);
+ kfree(q);
+ return err;
+ }
+
+ /*
+ * queue_me() must be called before releasing mmap_sem, because
+ * key->shared.inode needs to be referenced while holding it.
+ */
+ filp->private_data = q;
+
+ queue_me(q, ret, filp);
+ up_read(&current->mm->mmap_sem);
+
+ /* Now we map fd to filp, so userspace can access it */
+ fd_install(ret, filp);
+out:
+ return ret;
+}
+
+long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
+ unsigned long uaddr2, int val2, int val3)
+{
+ int ret;
+
+ switch (op) {
+ case FUTEX_WAIT:
+ ret = futex_wait(uaddr, val, timeout);
+ break;
+ case FUTEX_WAKE:
+ ret = futex_wake(uaddr, val);
+ break;
+ case FUTEX_FD:
+ /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
+ ret = futex_fd(uaddr, val);
+ break;
+ case FUTEX_REQUEUE:
+ ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+ break;
+ case FUTEX_CMP_REQUEUE:
+ ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+ return ret;
+}
+
+
+asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
+ struct timespec __user *utime, u32 __user *uaddr2,
+ int val3)
+{
+ struct timespec t;
+ unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+ int val2 = 0;
+
+ if ((op == FUTEX_WAIT) && utime) {
+ if (copy_from_user(&t, utime, sizeof(t)) != 0)
+ return -EFAULT;
+ timeout = timespec_to_jiffies(&t) + 1;
+ }
+ /*
+ * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
+ */
+ if (op >= FUTEX_REQUEUE)
+ val2 = (int) (unsigned long) utime;
+
+ return do_futex((unsigned long)uaddr, op, val, timeout,
+ (unsigned long)uaddr2, val2, val3);
+}
+
+static struct super_block *
+futexfs_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA);
+}
+
+static struct file_system_type futex_fs_type = {
+ .name = "futexfs",
+ .get_sb = futexfs_get_sb,
+ .kill_sb = kill_anon_super,
+};
+
+static int __init init(void)
+{
+ unsigned int i;
+
+ register_filesystem(&futex_fs_type);
+ futex_mnt = kern_mount(&futex_fs_type);
+
+ for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+ INIT_LIST_HEAD(&futex_queues[i].chain);
+ spin_lock_init(&futex_queues[i].lock);
+ }
+ return 0;
+}
+__initcall(init);
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
new file mode 100644
index 000000000000..388977f3e9b7
--- /dev/null
+++ b/kernel/intermodule.c
@@ -0,0 +1,182 @@
+/* Deprecated, do not use. Moved from module.c to here. --RR */
+
+/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+/* inter_module functions are always available, even when the kernel is
+ * compiled without modules. Consumers of inter_module_xxx routines
+ * will always work, even when both are built into the kernel, this
+ * approach removes lots of #ifdefs in mainline code.
+ */
+
+static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
+static DEFINE_SPINLOCK(ime_lock);
+static int kmalloc_failed;
+
+struct inter_module_entry {
+ struct list_head list;
+ const char *im_name;
+ struct module *owner;
+ const void *userdata;
+};
+
+/**
+ * inter_module_register - register a new set of inter module data.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ * @owner: module that is registering the data, always use THIS_MODULE
+ * @userdata: pointer to arbitrary userdata to be registered
+ *
+ * Description: Check that the im_name has not already been registered,
+ * complain if it has. For new data, add it to the inter_module_entry
+ * list.
+ */
+void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
+{
+ struct list_head *tmp;
+ struct inter_module_entry *ime, *ime_new;
+
+ if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) {
+ /* Overloaded kernel, not fatal */
+ printk(KERN_ERR
+ "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
+ im_name);
+ kmalloc_failed = 1;
+ return;
+ }
+ memset(ime_new, 0, sizeof(*ime_new));
+ ime_new->im_name = im_name;
+ ime_new->owner = owner;
+ ime_new->userdata = userdata;
+
+ spin_lock(&ime_lock);
+ list_for_each(tmp, &ime_list) {
+ ime = list_entry(tmp, struct inter_module_entry, list);
+ if (strcmp(ime->im_name, im_name) == 0) {
+ spin_unlock(&ime_lock);
+ kfree(ime_new);
+ /* Program logic error, fatal */
+ printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
+ BUG();
+ }
+ }
+ list_add(&(ime_new->list), &ime_list);
+ spin_unlock(&ime_lock);
+}
+
+/**
+ * inter_module_unregister - unregister a set of inter module data.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ *
+ * Description: Check that the im_name has been registered, complain if
+ * it has not. For existing data, remove it from the
+ * inter_module_entry list.
+ */
+void inter_module_unregister(const char *im_name)
+{
+ struct list_head *tmp;
+ struct inter_module_entry *ime;
+
+ spin_lock(&ime_lock);
+ list_for_each(tmp, &ime_list) {
+ ime = list_entry(tmp, struct inter_module_entry, list);
+ if (strcmp(ime->im_name, im_name) == 0) {
+ list_del(&(ime->list));
+ spin_unlock(&ime_lock);
+ kfree(ime);
+ return;
+ }
+ }
+ spin_unlock(&ime_lock);
+ if (kmalloc_failed) {
+ printk(KERN_ERR
+ "inter_module_unregister: no entry for '%s', "
+ "probably caused by previous kmalloc failure\n",
+ im_name);
+ return;
+ }
+ else {
+ /* Program logic error, fatal */
+ printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
+ BUG();
+ }
+}
+
+/**
+ * inter_module_get - return arbitrary userdata from another module.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ *
+ * Description: If the im_name has not been registered, return NULL.
+ * Try to increment the use count on the owning module, if that fails
+ * then return NULL. Otherwise return the userdata.
+ */
+static const void *inter_module_get(const char *im_name)
+{
+ struct list_head *tmp;
+ struct inter_module_entry *ime;
+ const void *result = NULL;
+
+ spin_lock(&ime_lock);
+ list_for_each(tmp, &ime_list) {
+ ime = list_entry(tmp, struct inter_module_entry, list);
+ if (strcmp(ime->im_name, im_name) == 0) {
+ if (try_module_get(ime->owner))
+ result = ime->userdata;
+ break;
+ }
+ }
+ spin_unlock(&ime_lock);
+ return(result);
+}
+
+/**
+ * inter_module_get_request - im get with automatic request_module.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ * @modname: module that is expected to register im_name
+ *
+ * Description: If inter_module_get fails, do request_module then retry.
+ */
+const void *inter_module_get_request(const char *im_name, const char *modname)
+{
+ const void *result = inter_module_get(im_name);
+ if (!result) {
+ request_module("%s", modname);
+ result = inter_module_get(im_name);
+ }
+ return(result);
+}
+
+/**
+ * inter_module_put - release use of data from another module.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ *
+ * Description: If the im_name has not been registered, complain,
+ * otherwise decrement the use count on the owning module.
+ */
+void inter_module_put(const char *im_name)
+{
+ struct list_head *tmp;
+ struct inter_module_entry *ime;
+
+ spin_lock(&ime_lock);
+ list_for_each(tmp, &ime_list) {
+ ime = list_entry(tmp, struct inter_module_entry, list);
+ if (strcmp(ime->im_name, im_name) == 0) {
+ if (ime->owner)
+ module_put(ime->owner);
+ spin_unlock(&ime_lock);
+ return;
+ }
+ }
+ spin_unlock(&ime_lock);
+ printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
+ BUG();
+}
+
+EXPORT_SYMBOL(inter_module_register);
+EXPORT_SYMBOL(inter_module_unregister);
+EXPORT_SYMBOL(inter_module_get_request);
+EXPORT_SYMBOL(inter_module_put);
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
new file mode 100644
index 000000000000..49378738ff5e
--- /dev/null
+++ b/kernel/irq/Makefile
@@ -0,0 +1,5 @@
+
+obj-y := handle.o manage.o spurious.o
+obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
+obj-$(CONFIG_PROC_FS) += proc.o
+
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
new file mode 100644
index 000000000000..98d62d8efeaf
--- /dev/null
+++ b/kernel/irq/autoprobe.c
@@ -0,0 +1,189 @@
+/*
+ * linux/kernel/irq/autoprobe.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the interrupt probing code and driver APIs.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+
+/*
+ * Autodetection depends on the fact that any interrupt that
+ * comes in on to an unassigned handler will get stuck with
+ * "IRQ_WAITING" cleared and the interrupt disabled.
+ */
+static DECLARE_MUTEX(probe_sem);
+
+/**
+ * probe_irq_on - begin an interrupt autodetect
+ *
+ * Commence probing for an interrupt. The interrupts are scanned
+ * and a mask of potential interrupt lines is returned.
+ *
+ */
+unsigned long probe_irq_on(void)
+{
+ unsigned long val, delay;
+ irq_desc_t *desc;
+ unsigned int i;
+
+ down(&probe_sem);
+ /*
+ * something may have generated an irq long ago and we want to
+ * flush such a longstanding irq before considering it as spurious.
+ */
+ for (i = NR_IRQS-1; i > 0; i--) {
+ desc = irq_desc + i;
+
+ spin_lock_irq(&desc->lock);
+ if (!irq_desc[i].action)
+ irq_desc[i].handler->startup(i);
+ spin_unlock_irq(&desc->lock);
+ }
+
+ /* Wait for longstanding interrupts to trigger. */
+ for (delay = jiffies + HZ/50; time_after(delay, jiffies); )
+ /* about 20ms delay */ barrier();
+
+ /*
+ * enable any unassigned irqs
+ * (we must startup again here because if a longstanding irq
+ * happened in the previous stage, it may have masked itself)
+ */
+ for (i = NR_IRQS-1; i > 0; i--) {
+ desc = irq_desc + i;
+
+ spin_lock_irq(&desc->lock);
+ if (!desc->action) {
+ desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
+ if (desc->handler->startup(i))
+ desc->status |= IRQ_PENDING;
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+
+ /*
+ * Wait for spurious interrupts to trigger
+ */
+ for (delay = jiffies + HZ/10; time_after(delay, jiffies); )
+ /* about 100ms delay */ barrier();
+
+ /*
+ * Now filter out any obviously spurious interrupts
+ */
+ val = 0;
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc_t *desc = irq_desc + i;
+ unsigned int status;
+
+ spin_lock_irq(&desc->lock);
+ status = desc->status;
+
+ if (status & IRQ_AUTODETECT) {
+ /* It triggered already - consider it spurious. */
+ if (!(status & IRQ_WAITING)) {
+ desc->status = status & ~IRQ_AUTODETECT;
+ desc->handler->shutdown(i);
+ } else
+ if (i < 32)
+ val |= 1 << i;
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+
+ return val;
+}
+
+EXPORT_SYMBOL(probe_irq_on);
+
+/**
+ * probe_irq_mask - scan a bitmap of interrupt lines
+ * @val: mask of interrupts to consider
+ *
+ * Scan the interrupt lines and return a bitmap of active
+ * autodetect interrupts. The interrupt probe logic state
+ * is then returned to its previous value.
+ *
+ * Note: we need to scan all the irq's even though we will
+ * only return autodetect irq numbers - just so that we reset
+ * them all to a known state.
+ */
+unsigned int probe_irq_mask(unsigned long val)
+{
+ unsigned int mask;
+ int i;
+
+ mask = 0;
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc_t *desc = irq_desc + i;
+ unsigned int status;
+
+ spin_lock_irq(&desc->lock);
+ status = desc->status;
+
+ if (status & IRQ_AUTODETECT) {
+ if (i < 16 && !(status & IRQ_WAITING))
+ mask |= 1 << i;
+
+ desc->status = status & ~IRQ_AUTODETECT;
+ desc->handler->shutdown(i);
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+ up(&probe_sem);
+
+ return mask & val;
+}
+EXPORT_SYMBOL(probe_irq_mask);
+
+/**
+ * probe_irq_off - end an interrupt autodetect
+ * @val: mask of potential interrupts (unused)
+ *
+ * Scans the unused interrupt lines and returns the line which
+ * appears to have triggered the interrupt. If no interrupt was
+ * found then zero is returned. If more than one interrupt is
+ * found then minus the first candidate is returned to indicate
+ * their is doubt.
+ *
+ * The interrupt probe logic state is returned to its previous
+ * value.
+ *
+ * BUGS: When used in a module (which arguably shouldn't happen)
+ * nothing prevents two IRQ probe callers from overlapping. The
+ * results of this are non-optimal.
+ */
+int probe_irq_off(unsigned long val)
+{
+ int i, irq_found = 0, nr_irqs = 0;
+
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc_t *desc = irq_desc + i;
+ unsigned int status;
+
+ spin_lock_irq(&desc->lock);
+ status = desc->status;
+
+ if (status & IRQ_AUTODETECT) {
+ if (!(status & IRQ_WAITING)) {
+ if (!nr_irqs)
+ irq_found = i;
+ nr_irqs++;
+ }
+ desc->status = status & ~IRQ_AUTODETECT;
+ desc->handler->shutdown(i);
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+ up(&probe_sem);
+
+ if (nr_irqs > 1)
+ irq_found = -irq_found;
+ return irq_found;
+}
+
+EXPORT_SYMBOL(probe_irq_off);
+
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
new file mode 100644
index 000000000000..2fb0e46e11f3
--- /dev/null
+++ b/kernel/irq/handle.c
@@ -0,0 +1,193 @@
+/*
+ * linux/kernel/irq/handle.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the core interrupt handling code.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+/*
+ * Linux has a controller-independent interrupt architecture.
+ * Every controller has a 'controller-template', that is used
+ * by the main code to do the right thing. Each driver-visible
+ * interrupt source is transparently wired to the apropriate
+ * controller. Thus drivers need not be aware of the
+ * interrupt-controller.
+ *
+ * The code is designed to be easily extended with new/different
+ * interrupt controllers, without having to do assembly magic or
+ * having to touch the generic code.
+ *
+ * Controller mappings for all interrupt sources:
+ */
+irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
+ [0 ... NR_IRQS-1] = {
+ .handler = &no_irq_type,
+ .lock = SPIN_LOCK_UNLOCKED
+ }
+};
+
+/*
+ * Generic 'no controller' code
+ */
+static void end_none(unsigned int irq) { }
+static void enable_none(unsigned int irq) { }
+static void disable_none(unsigned int irq) { }
+static void shutdown_none(unsigned int irq) { }
+static unsigned int startup_none(unsigned int irq) { return 0; }
+
+static void ack_none(unsigned int irq)
+{
+ /*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themself.
+ */
+ ack_bad_irq(irq);
+}
+
+struct hw_interrupt_type no_irq_type = {
+ .typename = "none",
+ .startup = startup_none,
+ .shutdown = shutdown_none,
+ .enable = enable_none,
+ .disable = disable_none,
+ .ack = ack_none,
+ .end = end_none,
+ .set_affinity = NULL
+};
+
+/*
+ * Special, empty irq handler:
+ */
+irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
+{
+ return IRQ_NONE;
+}
+
+/*
+ * Have got an event to handle:
+ */
+fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
+ struct irqaction *action)
+{
+ int ret, retval = 0, status = 0;
+
+ if (!(action->flags & SA_INTERRUPT))
+ local_irq_enable();
+
+ do {
+ ret = action->handler(irq, action->dev_id, regs);
+ if (ret == IRQ_HANDLED)
+ status |= action->flags;
+ retval |= ret;
+ action = action->next;
+ } while (action);
+
+ if (status & SA_SAMPLE_RANDOM)
+ add_interrupt_randomness(irq);
+ local_irq_disable();
+
+ return retval;
+}
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
+{
+ irq_desc_t *desc = irq_desc + irq;
+ struct irqaction * action;
+ unsigned int status;
+
+ kstat_this_cpu.irqs[irq]++;
+ if (desc->status & IRQ_PER_CPU) {
+ irqreturn_t action_ret;
+
+ /*
+ * No locking required for CPU-local interrupts:
+ */
+ desc->handler->ack(irq);
+ action_ret = handle_IRQ_event(irq, regs, desc->action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+ desc->handler->end(irq);
+ return 1;
+ }
+
+ spin_lock(&desc->lock);
+ desc->handler->ack(irq);
+ /*
+ * REPLAY is when Linux resends an IRQ that was dropped earlier
+ * WAITING is used by probe to mark irqs that are being tested
+ */
+ status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
+ status |= IRQ_PENDING; /* we _want_ to handle it */
+
+ /*
+ * If the IRQ is disabled for whatever reason, we cannot
+ * use the action we have.
+ */
+ action = NULL;
+ if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
+ action = desc->action;
+ status &= ~IRQ_PENDING; /* we commit to handling */
+ status |= IRQ_INPROGRESS; /* we are handling it */
+ }
+ desc->status = status;
+
+ /*
+ * If there is no IRQ handler or it was disabled, exit early.
+ * Since we set PENDING, if another processor is handling
+ * a different instance of this same irq, the other processor
+ * will take care of it.
+ */
+ if (unlikely(!action))
+ goto out;
+
+ /*
+ * Edge triggered interrupts need to remember
+ * pending events.
+ * This applies to any hw interrupts that allow a second
+ * instance of the same irq to arrive while we are in do_IRQ
+ * or in the handler. But the code here only handles the _second_
+ * instance of the irq, not the third or fourth. So it is mostly
+ * useful for irq hardware that does not mask cleanly in an
+ * SMP environment.
+ */
+ for (;;) {
+ irqreturn_t action_ret;
+
+ spin_unlock(&desc->lock);
+
+ action_ret = handle_IRQ_event(irq, regs, action);
+
+ spin_lock(&desc->lock);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+ if (likely(!(desc->status & IRQ_PENDING)))
+ break;
+ desc->status &= ~IRQ_PENDING;
+ }
+ desc->status &= ~IRQ_INPROGRESS;
+
+out:
+ /*
+ * The ->end() handler has to deal with interrupts which got
+ * disabled while the handler was running.
+ */
+ desc->handler->end(irq);
+ spin_unlock(&desc->lock);
+
+ return 1;
+}
+
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
new file mode 100644
index 000000000000..46feba630266
--- /dev/null
+++ b/kernel/irq/internals.h
@@ -0,0 +1,18 @@
+/*
+ * IRQ subsystem internal functions and variables:
+ */
+
+extern int noirqdebug;
+
+#ifdef CONFIG_PROC_FS
+extern void register_irq_proc(unsigned int irq);
+extern void register_handler_proc(unsigned int irq, struct irqaction *action);
+extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
+#else
+static inline void register_irq_proc(unsigned int irq) { }
+static inline void register_handler_proc(unsigned int irq,
+ struct irqaction *action) { }
+static inline void unregister_handler_proc(unsigned int irq,
+ struct irqaction *action) { }
+#endif
+
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
new file mode 100644
index 000000000000..5202e4c4a5b6
--- /dev/null
+++ b/kernel/irq/manage.c
@@ -0,0 +1,349 @@
+/*
+ * linux/kernel/irq/manage.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains driver APIs to the irq subsystem.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+#ifdef CONFIG_SMP
+
+cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
+
+/**
+ * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ *
+ * This function waits for any pending IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while
+ * holding a resource the IRQ handler may need you will deadlock.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void synchronize_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_desc + irq;
+
+ while (desc->status & IRQ_INPROGRESS)
+ cpu_relax();
+}
+
+EXPORT_SYMBOL(synchronize_irq);
+
+#endif
+
+/**
+ * disable_irq_nosync - disable an irq without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and Enables are
+ * nested.
+ * Unlike disable_irq(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
+ *
+ * This function may be called from IRQ context.
+ */
+void disable_irq_nosync(unsigned int irq)
+{
+ irq_desc_t *desc = irq_desc + irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ if (!desc->depth++) {
+ desc->status |= IRQ_DISABLED;
+ desc->handler->disable(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+EXPORT_SYMBOL(disable_irq_nosync);
+
+/**
+ * disable_irq - disable an irq and wait for completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are
+ * nested.
+ * This function waits for any pending IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while
+ * holding a resource the IRQ handler may need you will deadlock.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void disable_irq(unsigned int irq)
+{
+ irq_desc_t *desc = irq_desc + irq;
+
+ disable_irq_nosync(irq);
+ if (desc->action)
+ synchronize_irq(irq);
+}
+
+EXPORT_SYMBOL(disable_irq);
+
+/**
+ * enable_irq - enable handling of an irq
+ * @irq: Interrupt to enable
+ *
+ * Undoes the effect of one call to disable_irq(). If this
+ * matches the last disable, processing of interrupts on this
+ * IRQ line is re-enabled.
+ *
+ * This function may be called from IRQ context.
+ */
+void enable_irq(unsigned int irq)
+{
+ irq_desc_t *desc = irq_desc + irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ switch (desc->depth) {
+ case 0:
+ WARN_ON(1);
+ break;
+ case 1: {
+ unsigned int status = desc->status & ~IRQ_DISABLED;
+
+ desc->status = status;
+ if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
+ desc->status = status | IRQ_REPLAY;
+ hw_resend_irq(desc->handler,irq);
+ }
+ desc->handler->enable(irq);
+ /* fall-through */
+ }
+ default:
+ desc->depth--;
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+EXPORT_SYMBOL(enable_irq);
+
+/*
+ * Internal function that tells the architecture code whether a
+ * particular irq has been exclusively allocated or is available
+ * for driver use.
+ */
+int can_request_irq(unsigned int irq, unsigned long irqflags)
+{
+ struct irqaction *action;
+
+ if (irq >= NR_IRQS)
+ return 0;
+
+ action = irq_desc[irq].action;
+ if (action)
+ if (irqflags & action->flags & SA_SHIRQ)
+ action = NULL;
+
+ return !action;
+}
+
+/*
+ * Internal function to register an irqaction - typically used to
+ * allocate special interrupts that are part of the architecture.
+ */
+int setup_irq(unsigned int irq, struct irqaction * new)
+{
+ struct irq_desc *desc = irq_desc + irq;
+ struct irqaction *old, **p;
+ unsigned long flags;
+ int shared = 0;
+
+ if (desc->handler == &no_irq_type)
+ return -ENOSYS;
+ /*
+ * Some drivers like serial.c use request_irq() heavily,
+ * so we have to be careful not to interfere with a
+ * running system.
+ */
+ if (new->flags & SA_SAMPLE_RANDOM) {
+ /*
+ * This function might sleep, we want to call it first,
+ * outside of the atomic block.
+ * Yes, this might clear the entropy pool if the wrong
+ * driver is attempted to be loaded, without actually
+ * installing a new handler, but is this really a problem,
+ * only the sysadmin is able to do this.
+ */
+ rand_initialize_irq(irq);
+ }
+
+ /*
+ * The following block of code has to be executed atomically
+ */
+ spin_lock_irqsave(&desc->lock,flags);
+ p = &desc->action;
+ if ((old = *p) != NULL) {
+ /* Can't share interrupts unless both agree to */
+ if (!(old->flags & new->flags & SA_SHIRQ)) {
+ spin_unlock_irqrestore(&desc->lock,flags);
+ return -EBUSY;
+ }
+
+ /* add new interrupt at end of irq queue */
+ do {
+ p = &old->next;
+ old = *p;
+ } while (old);
+ shared = 1;
+ }
+
+ *p = new;
+
+ if (!shared) {
+ desc->depth = 0;
+ desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
+ IRQ_WAITING | IRQ_INPROGRESS);
+ if (desc->handler->startup)
+ desc->handler->startup(irq);
+ else
+ desc->handler->enable(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock,flags);
+
+ new->irq = irq;
+ register_irq_proc(irq);
+ new->dir = NULL;
+ register_handler_proc(irq, new);
+
+ return 0;
+}
+
+/**
+ * free_irq - free an interrupt
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Remove an interrupt handler. The handler is removed and if the
+ * interrupt line is no longer in use by any driver it is disabled.
+ * On a shared IRQ the caller must ensure the interrupt is disabled
+ * on the card it drives before calling this function. The function
+ * does not return until any executing interrupts for this IRQ
+ * have completed.
+ *
+ * This function must not be called from interrupt context.
+ */
+void free_irq(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc;
+ struct irqaction **p;
+ unsigned long flags;
+
+ if (irq >= NR_IRQS)
+ return;
+
+ desc = irq_desc + irq;
+ spin_lock_irqsave(&desc->lock,flags);
+ p = &desc->action;
+ for (;;) {
+ struct irqaction * action = *p;
+
+ if (action) {
+ struct irqaction **pp = p;
+
+ p = &action->next;
+ if (action->dev_id != dev_id)
+ continue;
+
+ /* Found it - now remove it from the list of entries */
+ *pp = action->next;
+ if (!desc->action) {
+ desc->status |= IRQ_DISABLED;
+ if (desc->handler->shutdown)
+ desc->handler->shutdown(irq);
+ else
+ desc->handler->disable(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock,flags);
+ unregister_handler_proc(irq, action);
+
+ /* Make sure it's not being used on another CPU */
+ synchronize_irq(irq);
+ kfree(action);
+ return;
+ }
+ printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
+ spin_unlock_irqrestore(&desc->lock,flags);
+ return;
+ }
+}
+
+EXPORT_SYMBOL(free_irq);
+
+/**
+ * request_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the
+ * interrupt line and IRQ handling. From the point this
+ * call is made your handler function may be invoked. Since
+ * your handler function must clear any interrupt the board
+ * raises, you must take care both to initialise your hardware
+ * and to set up the interrupt handler in the right order.
+ *
+ * Dev_id must be globally unique. Normally the address of the
+ * device data structure is used as the cookie. Since the handler
+ * receives this value it makes sense to use it.
+ *
+ * If your interrupt is shared you must pass a non NULL dev_id
+ * as this is required when freeing the interrupt.
+ *
+ * Flags:
+ *
+ * SA_SHIRQ Interrupt is shared
+ * SA_INTERRUPT Disable local interrupts while processing
+ * SA_SAMPLE_RANDOM The interrupt can be used for entropy
+ *
+ */
+int request_irq(unsigned int irq,
+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
+ unsigned long irqflags, const char * devname, void *dev_id)
+{
+ struct irqaction * action;
+ int retval;
+
+ /*
+ * Sanity-check: shared interrupts must pass in a real dev-ID,
+ * otherwise we'll have trouble later trying to figure out
+ * which interrupt is which (messes up the interrupt freeing
+ * logic etc).
+ */
+ if ((irqflags & SA_SHIRQ) && !dev_id)
+ return -EINVAL;
+ if (irq >= NR_IRQS)
+ return -EINVAL;
+ if (!handler)
+ return -EINVAL;
+
+ action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = irqflags;
+ cpus_clear(action->mask);
+ action->name = devname;
+ action->next = NULL;
+ action->dev_id = dev_id;
+
+ retval = setup_irq(irq, action);
+ if (retval)
+ kfree(action);
+
+ return retval;
+}
+
+EXPORT_SYMBOL(request_irq);
+
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
new file mode 100644
index 000000000000..85d08daa6600
--- /dev/null
+++ b/kernel/irq/proc.c
@@ -0,0 +1,159 @@
+/*
+ * linux/kernel/irq/proc.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the /proc/irq/ handling code.
+ */
+
+#include <linux/irq.h>
+#include <linux/proc_fs.h>
+#include <linux/interrupt.h>
+
+static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
+
+#ifdef CONFIG_SMP
+
+/*
+ * The /proc/irq/<irq>/smp_affinity values:
+ */
+static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
+
+void __attribute__((weak))
+proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+{
+ irq_affinity[irq] = mask_val;
+ irq_desc[irq].handler->set_affinity(irq, mask_val);
+}
+
+static int irq_affinity_read_proc(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]);
+
+ if (count - len < 2)
+ return -EINVAL;
+ len += sprintf(page + len, "\n");
+ return len;
+}
+
+int no_irq_affinity;
+static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
+ unsigned long count, void *data)
+{
+ unsigned int irq = (int)(long)data, full_count = count, err;
+ cpumask_t new_value, tmp;
+
+ if (!irq_desc[irq].handler->set_affinity || no_irq_affinity)
+ return -EIO;
+
+ err = cpumask_parse(buffer, count, new_value);
+ if (err)
+ return err;
+
+ /*
+ * Do not allow disabling IRQs completely - it's a too easy
+ * way to make the system unusable accidentally :-) At least
+ * one online CPU still has to be targeted.
+ */
+ cpus_and(tmp, new_value, cpu_online_map);
+ if (cpus_empty(tmp))
+ return -EINVAL;
+
+ proc_set_irq_affinity(irq, new_value);
+
+ return full_count;
+}
+
+#endif
+
+#define MAX_NAMELEN 128
+
+static int name_unique(unsigned int irq, struct irqaction *new_action)
+{
+ struct irq_desc *desc = irq_desc + irq;
+ struct irqaction *action;
+
+ for (action = desc->action ; action; action = action->next)
+ if ((action != new_action) && action->name &&
+ !strcmp(new_action->name, action->name))
+ return 0;
+ return 1;
+}
+
+void register_handler_proc(unsigned int irq, struct irqaction *action)
+{
+ char name [MAX_NAMELEN];
+
+ if (!irq_dir[irq] || action->dir || !action->name ||
+ !name_unique(irq, action))
+ return;
+
+ memset(name, 0, MAX_NAMELEN);
+ snprintf(name, MAX_NAMELEN, "%s", action->name);
+
+ /* create /proc/irq/1234/handler/ */
+ action->dir = proc_mkdir(name, irq_dir[irq]);
+}
+
+#undef MAX_NAMELEN
+
+#define MAX_NAMELEN 10
+
+void register_irq_proc(unsigned int irq)
+{
+ char name [MAX_NAMELEN];
+
+ if (!root_irq_dir ||
+ (irq_desc[irq].handler == &no_irq_type) ||
+ irq_dir[irq])
+ return;
+
+ memset(name, 0, MAX_NAMELEN);
+ sprintf(name, "%d", irq);
+
+ /* create /proc/irq/1234 */
+ irq_dir[irq] = proc_mkdir(name, root_irq_dir);
+
+#ifdef CONFIG_SMP
+ {
+ struct proc_dir_entry *entry;
+
+ /* create /proc/irq/<irq>/smp_affinity */
+ entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
+
+ if (entry) {
+ entry->nlink = 1;
+ entry->data = (void *)(long)irq;
+ entry->read_proc = irq_affinity_read_proc;
+ entry->write_proc = irq_affinity_write_proc;
+ }
+ smp_affinity_entry[irq] = entry;
+ }
+#endif
+}
+
+#undef MAX_NAMELEN
+
+void unregister_handler_proc(unsigned int irq, struct irqaction *action)
+{
+ if (action->dir)
+ remove_proc_entry(action->dir->name, irq_dir[irq]);
+}
+
+void init_irq_proc(void)
+{
+ int i;
+
+ /* create /proc/irq */
+ root_irq_dir = proc_mkdir("irq", NULL);
+ if (!root_irq_dir)
+ return;
+
+ /*
+ * Create entries for all existing IRQs.
+ */
+ for (i = 0; i < NR_IRQS; i++)
+ register_irq_proc(i);
+}
+
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
new file mode 100644
index 000000000000..f6297c306905
--- /dev/null
+++ b/kernel/irq/spurious.c
@@ -0,0 +1,96 @@
+/*
+ * linux/kernel/irq/spurious.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains spurious interrupt handling.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+
+/*
+ * If 99,900 of the previous 100,000 interrupts have not been handled
+ * then assume that the IRQ is stuck in some manner. Drop a diagnostic
+ * and try to turn the IRQ off.
+ *
+ * (The other 100-of-100,000 interrupts may have been a correctly
+ * functioning device sharing an IRQ with the failing one)
+ *
+ * Called under desc->lock
+ */
+
+static void
+__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+ struct irqaction *action;
+
+ if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
+ printk(KERN_ERR "irq event %d: bogus return value %x\n",
+ irq, action_ret);
+ } else {
+ printk(KERN_ERR "irq %d: nobody cared!\n", irq);
+ }
+ dump_stack();
+ printk(KERN_ERR "handlers:\n");
+ action = desc->action;
+ while (action) {
+ printk(KERN_ERR "[<%p>]", action->handler);
+ print_symbol(" (%s)",
+ (unsigned long)action->handler);
+ printk("\n");
+ action = action->next;
+ }
+}
+
+void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+ static int count = 100;
+
+ if (count > 0) {
+ count--;
+ __report_bad_irq(irq, desc, action_ret);
+ }
+}
+
+void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+ if (action_ret != IRQ_HANDLED) {
+ desc->irqs_unhandled++;
+ if (action_ret != IRQ_NONE)
+ report_bad_irq(irq, desc, action_ret);
+ }
+
+ desc->irq_count++;
+ if (desc->irq_count < 100000)
+ return;
+
+ desc->irq_count = 0;
+ if (desc->irqs_unhandled > 99900) {
+ /*
+ * The interrupt is stuck
+ */
+ __report_bad_irq(irq, desc, action_ret);
+ /*
+ * Now kill the IRQ
+ */
+ printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
+ desc->status |= IRQ_DISABLED;
+ desc->handler->disable(irq);
+ }
+ desc->irqs_unhandled = 0;
+}
+
+int noirqdebug;
+
+int __init noirqdebug_setup(char *str)
+{
+ noirqdebug = 1;
+ printk(KERN_INFO "IRQ lockup detection disabled\n");
+ return 1;
+}
+
+__setup("noirqdebug", noirqdebug_setup);
+
diff --git a/kernel/itimer.c b/kernel/itimer.c
new file mode 100644
index 000000000000..e9a40e947e07
--- /dev/null
+++ b/kernel/itimer.c
@@ -0,0 +1,241 @@
+/*
+ * linux/kernel/itimer.c
+ *
+ * Copyright (C) 1992 Darren Senn
+ */
+
+/* These are all the functions necessary to implement itimers */
+
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/syscalls.h>
+#include <linux/time.h>
+#include <linux/posix-timers.h>
+
+#include <asm/uaccess.h>
+
+static unsigned long it_real_value(struct signal_struct *sig)
+{
+ unsigned long val = 0;
+ if (timer_pending(&sig->real_timer)) {
+ val = sig->real_timer.expires - jiffies;
+
+ /* look out for negative/zero itimer.. */
+ if ((long) val <= 0)
+ val = 1;
+ }
+ return val;
+}
+
+int do_getitimer(int which, struct itimerval *value)
+{
+ struct task_struct *tsk = current;
+ unsigned long interval, val;
+ cputime_t cinterval, cval;
+
+ switch (which) {
+ case ITIMER_REAL:
+ spin_lock_irq(&tsk->sighand->siglock);
+ interval = tsk->signal->it_real_incr;
+ val = it_real_value(tsk->signal);
+ spin_unlock_irq(&tsk->sighand->siglock);
+ jiffies_to_timeval(val, &value->it_value);
+ jiffies_to_timeval(interval, &value->it_interval);
+ break;
+ case ITIMER_VIRTUAL:
+ read_lock(&tasklist_lock);
+ spin_lock_irq(&tsk->sighand->siglock);
+ cval = tsk->signal->it_virt_expires;
+ cinterval = tsk->signal->it_virt_incr;
+ if (!cputime_eq(cval, cputime_zero)) {
+ struct task_struct *t = tsk;
+ cputime_t utime = tsk->signal->utime;
+ do {
+ utime = cputime_add(utime, t->utime);
+ t = next_thread(t);
+ } while (t != tsk);
+ if (cputime_le(cval, utime)) { /* about to fire */
+ cval = jiffies_to_cputime(1);
+ } else {
+ cval = cputime_sub(cval, utime);
+ }
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+ read_unlock(&tasklist_lock);
+ cputime_to_timeval(cval, &value->it_value);
+ cputime_to_timeval(cinterval, &value->it_interval);
+ break;
+ case ITIMER_PROF:
+ read_lock(&tasklist_lock);
+ spin_lock_irq(&tsk->sighand->siglock);
+ cval = tsk->signal->it_prof_expires;
+ cinterval = tsk->signal->it_prof_incr;
+ if (!cputime_eq(cval, cputime_zero)) {
+ struct task_struct *t = tsk;
+ cputime_t ptime = cputime_add(tsk->signal->utime,
+ tsk->signal->stime);
+ do {
+ ptime = cputime_add(ptime,
+ cputime_add(t->utime,
+ t->stime));
+ t = next_thread(t);
+ } while (t != tsk);
+ if (cputime_le(cval, ptime)) { /* about to fire */
+ cval = jiffies_to_cputime(1);
+ } else {
+ cval = cputime_sub(cval, ptime);
+ }
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+ read_unlock(&tasklist_lock);
+ cputime_to_timeval(cval, &value->it_value);
+ cputime_to_timeval(cinterval, &value->it_interval);
+ break;
+ default:
+ return(-EINVAL);
+ }
+ return 0;
+}
+
+asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
+{
+ int error = -EFAULT;
+ struct itimerval get_buffer;
+
+ if (value) {
+ error = do_getitimer(which, &get_buffer);
+ if (!error &&
+ copy_to_user(value, &get_buffer, sizeof(get_buffer)))
+ error = -EFAULT;
+ }
+ return error;
+}
+
+/*
+ * Called with P->sighand->siglock held and P->signal->real_timer inactive.
+ * If interval is nonzero, arm the timer for interval ticks from now.
+ */
+static inline void it_real_arm(struct task_struct *p, unsigned long interval)
+{
+ p->signal->it_real_value = interval; /* XXX unnecessary field?? */
+ if (interval == 0)
+ return;
+ if (interval > (unsigned long) LONG_MAX)
+ interval = LONG_MAX;
+ p->signal->real_timer.expires = jiffies + interval;
+ add_timer(&p->signal->real_timer);
+}
+
+void it_real_fn(unsigned long __data)
+{
+ struct task_struct * p = (struct task_struct *) __data;
+
+ send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p);
+
+ /*
+ * Now restart the timer if necessary. We don't need any locking
+ * here because do_setitimer makes sure we have finished running
+ * before it touches anything.
+ */
+ it_real_arm(p, p->signal->it_real_incr);
+}
+
+int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
+{
+ struct task_struct *tsk = current;
+ unsigned long val, interval;
+ cputime_t cval, cinterval, nval, ninterval;
+
+ switch (which) {
+ case ITIMER_REAL:
+ spin_lock_irq(&tsk->sighand->siglock);
+ interval = tsk->signal->it_real_incr;
+ val = it_real_value(tsk->signal);
+ if (val)
+ del_timer_sync(&tsk->signal->real_timer);
+ tsk->signal->it_real_incr =
+ timeval_to_jiffies(&value->it_interval);
+ it_real_arm(tsk, timeval_to_jiffies(&value->it_value));
+ spin_unlock_irq(&tsk->sighand->siglock);
+ if (ovalue) {
+ jiffies_to_timeval(val, &ovalue->it_value);
+ jiffies_to_timeval(interval,
+ &ovalue->it_interval);
+ }
+ break;
+ case ITIMER_VIRTUAL:
+ nval = timeval_to_cputime(&value->it_value);
+ ninterval = timeval_to_cputime(&value->it_interval);
+ read_lock(&tasklist_lock);
+ spin_lock_irq(&tsk->sighand->siglock);
+ cval = tsk->signal->it_virt_expires;
+ cinterval = tsk->signal->it_virt_incr;
+ if (!cputime_eq(cval, cputime_zero) ||
+ !cputime_eq(nval, cputime_zero)) {
+ if (cputime_gt(nval, cputime_zero))
+ nval = cputime_add(nval,
+ jiffies_to_cputime(1));
+ set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
+ &nval, &cval);
+ }
+ tsk->signal->it_virt_expires = nval;
+ tsk->signal->it_virt_incr = ninterval;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ read_unlock(&tasklist_lock);
+ if (ovalue) {
+ cputime_to_timeval(cval, &ovalue->it_value);
+ cputime_to_timeval(cinterval, &ovalue->it_interval);
+ }
+ break;
+ case ITIMER_PROF:
+ nval = timeval_to_cputime(&value->it_value);
+ ninterval = timeval_to_cputime(&value->it_interval);
+ read_lock(&tasklist_lock);
+ spin_lock_irq(&tsk->sighand->siglock);
+ cval = tsk->signal->it_prof_expires;
+ cinterval = tsk->signal->it_prof_incr;
+ if (!cputime_eq(cval, cputime_zero) ||
+ !cputime_eq(nval, cputime_zero)) {
+ if (cputime_gt(nval, cputime_zero))
+ nval = cputime_add(nval,
+ jiffies_to_cputime(1));
+ set_process_cpu_timer(tsk, CPUCLOCK_PROF,
+ &nval, &cval);
+ }
+ tsk->signal->it_prof_expires = nval;
+ tsk->signal->it_prof_incr = ninterval;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ read_unlock(&tasklist_lock);
+ if (ovalue) {
+ cputime_to_timeval(cval, &ovalue->it_value);
+ cputime_to_timeval(cinterval, &ovalue->it_interval);
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+asmlinkage long sys_setitimer(int which,
+ struct itimerval __user *value,
+ struct itimerval __user *ovalue)
+{
+ struct itimerval set_buffer, get_buffer;
+ int error;
+
+ if (value) {
+ if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
+ return -EFAULT;
+ } else
+ memset((char *) &set_buffer, 0, sizeof(set_buffer));
+
+ error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
+ if (error || !ovalue)
+ return error;
+
+ if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
+ return -EFAULT;
+ return 0;
+}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
new file mode 100644
index 000000000000..1627f8d6e0cd
--- /dev/null
+++ b/kernel/kallsyms.c
@@ -0,0 +1,411 @@
+/*
+ * kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
+ *
+ * Rewritten and vastly simplified by Rusty Russell for in-kernel
+ * module loader:
+ * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ *
+ * ChangeLog:
+ *
+ * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com>
+ * Changed the compression method from stem compression to "table lookup"
+ * compression (see scripts/kallsyms.c for a more complete description)
+ */
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/proc_fs.h>
+#include <linux/mm.h>
+
+#include <asm/sections.h>
+
+#ifdef CONFIG_KALLSYMS_ALL
+#define all_var 1
+#else
+#define all_var 0
+#endif
+
+/* These will be re-linked against their real values during the second link stage */
+extern unsigned long kallsyms_addresses[] __attribute__((weak));
+extern unsigned long kallsyms_num_syms __attribute__((weak,section("data")));
+extern u8 kallsyms_names[] __attribute__((weak));
+
+extern u8 kallsyms_token_table[] __attribute__((weak));
+extern u16 kallsyms_token_index[] __attribute__((weak));
+
+extern unsigned long kallsyms_markers[] __attribute__((weak));
+
+static inline int is_kernel_inittext(unsigned long addr)
+{
+ if (addr >= (unsigned long)_sinittext
+ && addr <= (unsigned long)_einittext)
+ return 1;
+ return 0;
+}
+
+static inline int is_kernel_text(unsigned long addr)
+{
+ if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
+ return 1;
+ return in_gate_area_no_task(addr);
+}
+
+static inline int is_kernel(unsigned long addr)
+{
+ if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
+ return 1;
+ return in_gate_area_no_task(addr);
+}
+
+/* expand a compressed symbol data into the resulting uncompressed string,
+ given the offset to where the symbol is in the compressed stream */
+static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
+{
+ int len, skipped_first = 0;
+ u8 *tptr, *data;
+
+ /* get the compressed symbol length from the first symbol byte */
+ data = &kallsyms_names[off];
+ len = *data;
+ data++;
+
+ /* update the offset to return the offset for the next symbol on
+ * the compressed stream */
+ off += len + 1;
+
+ /* for every byte on the compressed symbol data, copy the table
+ entry for that byte */
+ while(len) {
+ tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ];
+ data++;
+ len--;
+
+ while (*tptr) {
+ if(skipped_first) {
+ *result = *tptr;
+ result++;
+ } else
+ skipped_first = 1;
+ tptr++;
+ }
+ }
+
+ *result = '\0';
+
+ /* return to offset to the next symbol */
+ return off;
+}
+
+/* get symbol type information. This is encoded as a single char at the
+ * begining of the symbol name */
+static char kallsyms_get_symbol_type(unsigned int off)
+{
+ /* get just the first code, look it up in the token table, and return the
+ * first char from this token */
+ return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ];
+}
+
+
+/* find the offset on the compressed stream given and index in the
+ * kallsyms array */
+static unsigned int get_symbol_offset(unsigned long pos)
+{
+ u8 *name;
+ int i;
+
+ /* use the closest marker we have. We have markers every 256 positions,
+ * so that should be close enough */
+ name = &kallsyms_names[ kallsyms_markers[pos>>8] ];
+
+ /* sequentially scan all the symbols up to the point we're searching for.
+ * Every symbol is stored in a [<len>][<len> bytes of data] format, so we
+ * just need to add the len to the current pointer for every symbol we
+ * wish to skip */
+ for(i = 0; i < (pos&0xFF); i++)
+ name = name + (*name) + 1;
+
+ return name - kallsyms_names;
+}
+
+/* Lookup the address for this symbol. Returns 0 if not found. */
+unsigned long kallsyms_lookup_name(const char *name)
+{
+ char namebuf[KSYM_NAME_LEN+1];
+ unsigned long i;
+ unsigned int off;
+
+ for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
+ off = kallsyms_expand_symbol(off, namebuf);
+
+ if (strcmp(namebuf, name) == 0)
+ return kallsyms_addresses[i];
+ }
+ return module_kallsyms_lookup_name(name);
+}
+EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
+
+/*
+ * Lookup an address
+ * - modname is set to NULL if it's in the kernel
+ * - we guarantee that the returned name is valid until we reschedule even if
+ * it resides in a module
+ * - we also guarantee that modname will be valid until rescheduled
+ */
+const char *kallsyms_lookup(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset,
+ char **modname, char *namebuf)
+{
+ unsigned long i, low, high, mid;
+ const char *msym;
+
+ /* This kernel should never had been booted. */
+ BUG_ON(!kallsyms_addresses);
+
+ namebuf[KSYM_NAME_LEN] = 0;
+ namebuf[0] = 0;
+
+ if ((all_var && is_kernel(addr)) ||
+ (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr)))) {
+ unsigned long symbol_end=0;
+
+ /* do a binary search on the sorted kallsyms_addresses array */
+ low = 0;
+ high = kallsyms_num_syms;
+
+ while (high-low > 1) {
+ mid = (low + high) / 2;
+ if (kallsyms_addresses[mid] <= addr) low = mid;
+ else high = mid;
+ }
+
+ /* search for the first aliased symbol. Aliased symbols are
+ symbols with the same address */
+ while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low])
+ --low;
+
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(low), namebuf);
+
+ /* Search for next non-aliased symbol */
+ for (i = low + 1; i < kallsyms_num_syms; i++) {
+ if (kallsyms_addresses[i] > kallsyms_addresses[low]) {
+ symbol_end = kallsyms_addresses[i];
+ break;
+ }
+ }
+
+ /* if we found no next symbol, we use the end of the section */
+ if (!symbol_end) {
+ if (is_kernel_inittext(addr))
+ symbol_end = (unsigned long)_einittext;
+ else
+ symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext;
+ }
+
+ *symbolsize = symbol_end - kallsyms_addresses[low];
+ *modname = NULL;
+ *offset = addr - kallsyms_addresses[low];
+ return namebuf;
+ }
+
+ /* see if it's in a module */
+ msym = module_address_lookup(addr, symbolsize, offset, modname);
+ if (msym)
+ return strncpy(namebuf, msym, KSYM_NAME_LEN);
+
+ return NULL;
+}
+
+/* Replace "%s" in format with address, or returns -errno. */
+void __print_symbol(const char *fmt, unsigned long address)
+{
+ char *modname;
+ const char *name;
+ unsigned long offset, size;
+ char namebuf[KSYM_NAME_LEN+1];
+ char buffer[sizeof("%s+%#lx/%#lx [%s]") + KSYM_NAME_LEN +
+ 2*(BITS_PER_LONG*3/10) + MODULE_NAME_LEN + 1];
+
+ name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
+
+ if (!name)
+ sprintf(buffer, "0x%lx", address);
+ else {
+ if (modname)
+ sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
+ size, modname);
+ else
+ sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
+ }
+ printk(fmt, buffer);
+}
+
+/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
+struct kallsym_iter
+{
+ loff_t pos;
+ struct module *owner;
+ unsigned long value;
+ unsigned int nameoff; /* If iterating in core kernel symbols */
+ char type;
+ char name[KSYM_NAME_LEN+1];
+};
+
+/* Only label it "global" if it is exported. */
+static void upcase_if_global(struct kallsym_iter *iter)
+{
+ if (is_exported(iter->name, iter->owner))
+ iter->type += 'A' - 'a';
+}
+
+static int get_ksymbol_mod(struct kallsym_iter *iter)
+{
+ iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
+ &iter->value,
+ &iter->type, iter->name);
+ if (iter->owner == NULL)
+ return 0;
+
+ upcase_if_global(iter);
+ return 1;
+}
+
+/* Returns space to next name. */
+static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
+{
+ unsigned off = iter->nameoff;
+
+ iter->owner = NULL;
+ iter->value = kallsyms_addresses[iter->pos];
+
+ iter->type = kallsyms_get_symbol_type(off);
+
+ off = kallsyms_expand_symbol(off, iter->name);
+
+ return off - iter->nameoff;
+}
+
+static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
+{
+ iter->name[0] = '\0';
+ iter->nameoff = get_symbol_offset(new_pos);
+ iter->pos = new_pos;
+}
+
+/* Returns false if pos at or past end of file. */
+static int update_iter(struct kallsym_iter *iter, loff_t pos)
+{
+ /* Module symbols can be accessed randomly. */
+ if (pos >= kallsyms_num_syms) {
+ iter->pos = pos;
+ return get_ksymbol_mod(iter);
+ }
+
+ /* If we're not on the desired position, reset to new position. */
+ if (pos != iter->pos)
+ reset_iter(iter, pos);
+
+ iter->nameoff += get_ksymbol_core(iter);
+ iter->pos++;
+
+ return 1;
+}
+
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ (*pos)++;
+
+ if (!update_iter(m->private, *pos))
+ return NULL;
+ return p;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ if (!update_iter(m->private, *pos))
+ return NULL;
+ return m->private;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+ struct kallsym_iter *iter = m->private;
+
+ /* Some debugging symbols have no name. Ignore them. */
+ if (!iter->name[0])
+ return 0;
+
+ if (iter->owner)
+ seq_printf(m, "%0*lx %c %s\t[%s]\n",
+ (int)(2*sizeof(void*)),
+ iter->value, iter->type, iter->name,
+ module_name(iter->owner));
+ else
+ seq_printf(m, "%0*lx %c %s\n",
+ (int)(2*sizeof(void*)),
+ iter->value, iter->type, iter->name);
+ return 0;
+}
+
+static struct seq_operations kallsyms_op = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show
+};
+
+static int kallsyms_open(struct inode *inode, struct file *file)
+{
+ /* We keep iterator in m->private, since normal case is to
+ * s_start from where we left off, so we avoid doing
+ * using get_symbol_offset for every symbol */
+ struct kallsym_iter *iter;
+ int ret;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+ reset_iter(iter, 0);
+
+ ret = seq_open(file, &kallsyms_op);
+ if (ret == 0)
+ ((struct seq_file *)file->private_data)->private = iter;
+ else
+ kfree(iter);
+ return ret;
+}
+
+static int kallsyms_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = (struct seq_file *)file->private_data;
+ kfree(m->private);
+ return seq_release(inode, file);
+}
+
+static struct file_operations kallsyms_operations = {
+ .open = kallsyms_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kallsyms_release,
+};
+
+static int __init kallsyms_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = create_proc_entry("kallsyms", 0444, NULL);
+ if (entry)
+ entry->proc_fops = &kallsyms_operations;
+ return 0;
+}
+__initcall(kallsyms_init);
+
+EXPORT_SYMBOL(__print_symbol);
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
new file mode 100644
index 000000000000..179baafcdd96
--- /dev/null
+++ b/kernel/kfifo.c
@@ -0,0 +1,168 @@
+/*
+ * A simple kernel FIFO implementation.
+ *
+ * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/kfifo.h>
+
+/**
+ * kfifo_init - allocates a new FIFO using a preallocated buffer
+ * @buffer: the preallocated buffer to be used.
+ * @size: the size of the internal buffer, this have to be a power of 2.
+ * @gfp_mask: get_free_pages mask, passed to kmalloc()
+ * @lock: the lock to be used to protect the fifo buffer
+ *
+ * Do NOT pass the kfifo to kfifo_free() after use ! Simply free the
+ * struct kfifo with kfree().
+ */
+struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
+ unsigned int __nocast gfp_mask, spinlock_t *lock)
+{
+ struct kfifo *fifo;
+
+ /* size must be a power of 2 */
+ BUG_ON(size & (size - 1));
+
+ fifo = kmalloc(sizeof(struct kfifo), gfp_mask);
+ if (!fifo)
+ return ERR_PTR(-ENOMEM);
+
+ fifo->buffer = buffer;
+ fifo->size = size;
+ fifo->in = fifo->out = 0;
+ fifo->lock = lock;
+
+ return fifo;
+}
+EXPORT_SYMBOL(kfifo_init);
+
+/**
+ * kfifo_alloc - allocates a new FIFO and its internal buffer
+ * @size: the size of the internal buffer to be allocated.
+ * @gfp_mask: get_free_pages mask, passed to kmalloc()
+ * @lock: the lock to be used to protect the fifo buffer
+ *
+ * The size will be rounded-up to a power of 2.
+ */
+struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock)
+{
+ unsigned char *buffer;
+ struct kfifo *ret;
+
+ /*
+ * round up to the next power of 2, since our 'let the indices
+ * wrap' tachnique works only in this case.
+ */
+ if (size & (size - 1)) {
+ BUG_ON(size > 0x80000000);
+ size = roundup_pow_of_two(size);
+ }
+
+ buffer = kmalloc(size, gfp_mask);
+ if (!buffer)
+ return ERR_PTR(-ENOMEM);
+
+ ret = kfifo_init(buffer, size, gfp_mask, lock);
+
+ if (IS_ERR(ret))
+ kfree(buffer);
+
+ return ret;
+}
+EXPORT_SYMBOL(kfifo_alloc);
+
+/**
+ * kfifo_free - frees the FIFO
+ * @fifo: the fifo to be freed.
+ */
+void kfifo_free(struct kfifo *fifo)
+{
+ kfree(fifo->buffer);
+ kfree(fifo);
+}
+EXPORT_SYMBOL(kfifo_free);
+
+/**
+ * __kfifo_put - puts some data into the FIFO, no locking version
+ * @fifo: the fifo to be used.
+ * @buffer: the data to be added.
+ * @len: the length of the data to be added.
+ *
+ * This function copies at most 'len' bytes from the 'buffer' into
+ * the FIFO depending on the free space, and returns the number of
+ * bytes copied.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these functions.
+ */
+unsigned int __kfifo_put(struct kfifo *fifo,
+ unsigned char *buffer, unsigned int len)
+{
+ unsigned int l;
+
+ len = min(len, fifo->size - fifo->in + fifo->out);
+
+ /* first put the data starting from fifo->in to buffer end */
+ l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
+ memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
+
+ /* then put the rest (if any) at the beginning of the buffer */
+ memcpy(fifo->buffer, buffer + l, len - l);
+
+ fifo->in += len;
+
+ return len;
+}
+EXPORT_SYMBOL(__kfifo_put);
+
+/**
+ * __kfifo_get - gets some data from the FIFO, no locking version
+ * @fifo: the fifo to be used.
+ * @buffer: where the data must be copied.
+ * @len: the size of the destination buffer.
+ *
+ * This function copies at most 'len' bytes from the FIFO into the
+ * 'buffer' and returns the number of copied bytes.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these functions.
+ */
+unsigned int __kfifo_get(struct kfifo *fifo,
+ unsigned char *buffer, unsigned int len)
+{
+ unsigned int l;
+
+ len = min(len, fifo->in - fifo->out);
+
+ /* first get the data from fifo->out until the end of the buffer */
+ l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
+ memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
+
+ /* then get the rest (if any) from the beginning of the buffer */
+ memcpy(buffer + l, fifo->buffer, len - l);
+
+ fifo->out += len;
+
+ return len;
+}
+EXPORT_SYMBOL(__kfifo_get);
diff --git a/kernel/kmod.c b/kernel/kmod.c
new file mode 100644
index 000000000000..eed53d4f5230
--- /dev/null
+++ b/kernel/kmod.c
@@ -0,0 +1,256 @@
+/*
+ kmod, the new module loader (replaces kerneld)
+ Kirk Petersen
+
+ Reorganized not to be a daemon by Adam Richter, with guidance
+ from Greg Zornetzer.
+
+ Modified to avoid chroot and file sharing problems.
+ Mikael Pettersson
+
+ Limit the concurrent number of kmod modprobes to catch loops from
+ "modprobe needs a service that is in a module".
+ Keith Owens <kaos@ocs.com.au> December 1999
+
+ Unblock all signals when we exec a usermode process.
+ Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
+
+ call_usermodehelper wait flag, and remove exec_usermodehelper.
+ Rusty Russell <rusty@rustcorp.com.au> Jan 2003
+*/
+#define __KERNEL_SYSCALLS__
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/namespace.h>
+#include <linux/completion.h>
+#include <linux/file.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+
+extern int max_threads;
+
+static struct workqueue_struct *khelper_wq;
+
+#ifdef CONFIG_KMOD
+
+/*
+ modprobe_path is set via /proc/sys.
+*/
+char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
+
+/**
+ * request_module - try to load a kernel module
+ * @fmt: printf style format string for the name of the module
+ * @varargs: arguements as specified in the format string
+ *
+ * Load a module using the user mode module loader. The function returns
+ * zero on success or a negative errno code on failure. Note that a
+ * successful module load does not mean the module did not then unload
+ * and exit on an error of its own. Callers must check that the service
+ * they requested is now available not blindly invoke it.
+ *
+ * If module auto-loading support is disabled then this function
+ * becomes a no-operation.
+ */
+int request_module(const char *fmt, ...)
+{
+ va_list args;
+ char module_name[MODULE_NAME_LEN];
+ unsigned int max_modprobes;
+ int ret;
+ char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
+ static char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL };
+ static atomic_t kmod_concurrent = ATOMIC_INIT(0);
+#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
+ static int kmod_loop_msg;
+
+ va_start(args, fmt);
+ ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+ va_end(args);
+ if (ret >= MODULE_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ /* If modprobe needs a service that is in a module, we get a recursive
+ * loop. Limit the number of running kmod threads to max_threads/2 or
+ * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
+ * would be to run the parents of this process, counting how many times
+ * kmod was invoked. That would mean accessing the internals of the
+ * process tables to get the command line, proc_pid_cmdline is static
+ * and it is not worth changing the proc code just to handle this case.
+ * KAO.
+ *
+ * "trace the ppid" is simple, but will fail if someone's
+ * parent exits. I think this is as good as it gets. --RR
+ */
+ max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
+ atomic_inc(&kmod_concurrent);
+ if (atomic_read(&kmod_concurrent) > max_modprobes) {
+ /* We may be blaming an innocent here, but unlikely */
+ if (kmod_loop_msg++ < 5)
+ printk(KERN_ERR
+ "request_module: runaway loop modprobe %s\n",
+ module_name);
+ atomic_dec(&kmod_concurrent);
+ return -ENOMEM;
+ }
+
+ ret = call_usermodehelper(modprobe_path, argv, envp, 1);
+ atomic_dec(&kmod_concurrent);
+ return ret;
+}
+EXPORT_SYMBOL(request_module);
+#endif /* CONFIG_KMOD */
+
+struct subprocess_info {
+ struct completion *complete;
+ char *path;
+ char **argv;
+ char **envp;
+ int wait;
+ int retval;
+};
+
+/*
+ * This is the task which runs the usermode application
+ */
+static int ____call_usermodehelper(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ int retval;
+
+ /* Unblock all signals. */
+ flush_signals(current);
+ spin_lock_irq(&current->sighand->siglock);
+ flush_signal_handlers(current, 1);
+ sigemptyset(&current->blocked);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /* We can run anywhere, unlike our parent keventd(). */
+ set_cpus_allowed(current, CPU_MASK_ALL);
+
+ retval = -EPERM;
+ if (current->fs->root)
+ retval = execve(sub_info->path, sub_info->argv,sub_info->envp);
+
+ /* Exec failed? */
+ sub_info->retval = retval;
+ do_exit(0);
+}
+
+/* Keventd can't block, but this (a child) can. */
+static int wait_for_helper(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ pid_t pid;
+ struct k_sigaction sa;
+
+ /* Install a handler: if SIGCLD isn't handled sys_wait4 won't
+ * populate the status, but will return -ECHILD. */
+ sa.sa.sa_handler = SIG_IGN;
+ sa.sa.sa_flags = 0;
+ siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
+ do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
+ allow_signal(SIGCHLD);
+
+ pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ } else {
+ /*
+ * Normally it is bogus to call wait4() from in-kernel because
+ * wait4() wants to write the exit code to a userspace address.
+ * But wait_for_helper() always runs as keventd, and put_user()
+ * to a kernel address works OK for kernel threads, due to their
+ * having an mm_segment_t which spans the entire address space.
+ *
+ * Thus the __user pointer cast is valid here.
+ */
+ sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL);
+ }
+
+ complete(sub_info->complete);
+ return 0;
+}
+
+/* This is run by khelper thread */
+static void __call_usermodehelper(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ pid_t pid;
+
+ /* CLONE_VFORK: wait until the usermode helper has execve'd
+ * successfully We need the data structures to stay around
+ * until that is done. */
+ if (sub_info->wait)
+ pid = kernel_thread(wait_for_helper, sub_info,
+ CLONE_FS | CLONE_FILES | SIGCHLD);
+ else
+ pid = kernel_thread(____call_usermodehelper, sub_info,
+ CLONE_VFORK | SIGCHLD);
+
+ if (pid < 0) {
+ sub_info->retval = pid;
+ complete(sub_info->complete);
+ } else if (!sub_info->wait)
+ complete(sub_info->complete);
+}
+
+/**
+ * call_usermodehelper - start a usermode application
+ * @path: pathname for the application
+ * @argv: null-terminated argument list
+ * @envp: null-terminated environment list
+ * @wait: wait for the application to finish and return status.
+ *
+ * Runs a user-space application. The application is started
+ * asynchronously if wait is not set, and runs as a child of keventd.
+ * (ie. it runs with full root capabilities).
+ *
+ * Must be called from process context. Returns a negative error code
+ * if program was not execed successfully, or 0.
+ */
+int call_usermodehelper(char *path, char **argv, char **envp, int wait)
+{
+ DECLARE_COMPLETION(done);
+ struct subprocess_info sub_info = {
+ .complete = &done,
+ .path = path,
+ .argv = argv,
+ .envp = envp,
+ .wait = wait,
+ .retval = 0,
+ };
+ DECLARE_WORK(work, __call_usermodehelper, &sub_info);
+
+ if (!khelper_wq)
+ return -EBUSY;
+
+ if (path[0] == '\0')
+ return 0;
+
+ queue_work(khelper_wq, &work);
+ wait_for_completion(&done);
+ return sub_info.retval;
+}
+EXPORT_SYMBOL(call_usermodehelper);
+
+void __init usermodehelper_init(void)
+{
+ khelper_wq = create_singlethread_workqueue("khelper");
+ BUG_ON(!khelper_wq);
+}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
new file mode 100644
index 000000000000..1d5dd1337bd1
--- /dev/null
+++ b/kernel/kprobes.c
@@ -0,0 +1,157 @@
+/*
+ * Kernel Probes (KProbes)
+ * kernel/kprobes.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ * Probes initial implementation (includes suggestions from
+ * Rusty Russell).
+ * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
+ * hlists and exceptions notifier as suggested by Andi Kleen.
+ * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ * interface to access function arguments.
+ * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
+ * exceptions notifier to be first on the priority list.
+ */
+#include <linux/kprobes.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cacheflush.h>
+#include <asm/errno.h>
+#include <asm/kdebug.h>
+
+#define KPROBE_HASH_BITS 6
+#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+
+static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
+
+unsigned int kprobe_cpu = NR_CPUS;
+static DEFINE_SPINLOCK(kprobe_lock);
+
+/* Locks kprobe: irqs must be disabled */
+void lock_kprobes(void)
+{
+ spin_lock(&kprobe_lock);
+ kprobe_cpu = smp_processor_id();
+}
+
+void unlock_kprobes(void)
+{
+ kprobe_cpu = NR_CPUS;
+ spin_unlock(&kprobe_lock);
+}
+
+/* You have to be holding the kprobe_lock */
+struct kprobe *get_kprobe(void *addr)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+
+ head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
+ hlist_for_each(node, head) {
+ struct kprobe *p = hlist_entry(node, struct kprobe, hlist);
+ if (p->addr == addr)
+ return p;
+ }
+ return NULL;
+}
+
+int register_kprobe(struct kprobe *p)
+{
+ int ret = 0;
+ unsigned long flags = 0;
+
+ if ((ret = arch_prepare_kprobe(p)) != 0) {
+ goto rm_kprobe;
+ }
+ spin_lock_irqsave(&kprobe_lock, flags);
+ INIT_HLIST_NODE(&p->hlist);
+ if (get_kprobe(p->addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ arch_copy_kprobe(p);
+
+ hlist_add_head(&p->hlist,
+ &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+
+ p->opcode = *p->addr;
+ *p->addr = BREAKPOINT_INSTRUCTION;
+ flush_icache_range((unsigned long) p->addr,
+ (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+out:
+ spin_unlock_irqrestore(&kprobe_lock, flags);
+rm_kprobe:
+ if (ret == -EEXIST)
+ arch_remove_kprobe(p);
+ return ret;
+}
+
+void unregister_kprobe(struct kprobe *p)
+{
+ unsigned long flags;
+ arch_remove_kprobe(p);
+ spin_lock_irqsave(&kprobe_lock, flags);
+ *p->addr = p->opcode;
+ hlist_del(&p->hlist);
+ flush_icache_range((unsigned long) p->addr,
+ (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ spin_unlock_irqrestore(&kprobe_lock, flags);
+}
+
+static struct notifier_block kprobe_exceptions_nb = {
+ .notifier_call = kprobe_exceptions_notify,
+ .priority = 0x7fffffff /* we need to notified first */
+};
+
+int register_jprobe(struct jprobe *jp)
+{
+ /* Todo: Verify probepoint is a function entry point */
+ jp->kp.pre_handler = setjmp_pre_handler;
+ jp->kp.break_handler = longjmp_break_handler;
+
+ return register_kprobe(&jp->kp);
+}
+
+void unregister_jprobe(struct jprobe *jp)
+{
+ unregister_kprobe(&jp->kp);
+}
+
+static int __init init_kprobes(void)
+{
+ int i, err = 0;
+
+ /* FIXME allocate the probe table, currently defined statically */
+ /* initialize all list heads */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&kprobe_table[i]);
+
+ err = register_die_notifier(&kprobe_exceptions_nb);
+ return err;
+}
+
+__initcall(init_kprobes);
+
+EXPORT_SYMBOL_GPL(register_kprobe);
+EXPORT_SYMBOL_GPL(unregister_kprobe);
+EXPORT_SYMBOL_GPL(register_jprobe);
+EXPORT_SYMBOL_GPL(unregister_jprobe);
+EXPORT_SYMBOL_GPL(jprobe_return);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
new file mode 100644
index 000000000000..1f064a63f8cf
--- /dev/null
+++ b/kernel/ksysfs.c
@@ -0,0 +1,57 @@
+/*
+ * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
+ * are not related to any other subsystem
+ *
+ * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
+ *
+ * This file is release under the GPLv2
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#define KERNEL_ATTR_RO(_name) \
+static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
+
+#define KERNEL_ATTR_RW(_name) \
+static struct subsys_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+#ifdef CONFIG_HOTPLUG
+static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum);
+}
+KERNEL_ATTR_RO(hotplug_seqnum);
+#endif
+
+decl_subsys(kernel, NULL, NULL);
+EXPORT_SYMBOL_GPL(kernel_subsys);
+
+static struct attribute * kernel_attrs[] = {
+#ifdef CONFIG_HOTPLUG
+ &hotplug_seqnum_attr.attr,
+#endif
+ NULL
+};
+
+static struct attribute_group kernel_attr_group = {
+ .attrs = kernel_attrs,
+};
+
+static int __init ksysfs_init(void)
+{
+ int error = subsystem_register(&kernel_subsys);
+ if (!error)
+ error = sysfs_create_group(&kernel_subsys.kset.kobj,
+ &kernel_attr_group);
+
+ return error;
+}
+
+core_initcall(ksysfs_init);
diff --git a/kernel/kthread.c b/kernel/kthread.c
new file mode 100644
index 000000000000..e377e2244103
--- /dev/null
+++ b/kernel/kthread.c
@@ -0,0 +1,202 @@
+/* Kernel thread helper functions.
+ * Copyright (C) 2004 IBM Corporation, Rusty Russell.
+ *
+ * Creation is done via keventd, so that we get a clean environment
+ * even if we're invoked from userspace (think modprobe, hotplug cpu,
+ * etc.).
+ */
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/unistd.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <asm/semaphore.h>
+
+/*
+ * We dont want to execute off keventd since it might
+ * hold a semaphore our callers hold too:
+ */
+static struct workqueue_struct *helper_wq;
+
+struct kthread_create_info
+{
+ /* Information passed to kthread() from keventd. */
+ int (*threadfn)(void *data);
+ void *data;
+ struct completion started;
+
+ /* Result passed back to kthread_create() from keventd. */
+ struct task_struct *result;
+ struct completion done;
+};
+
+struct kthread_stop_info
+{
+ struct task_struct *k;
+ int err;
+ struct completion done;
+};
+
+/* Thread stopping is done by setthing this var: lock serializes
+ * multiple kthread_stop calls. */
+static DECLARE_MUTEX(kthread_stop_lock);
+static struct kthread_stop_info kthread_stop_info;
+
+int kthread_should_stop(void)
+{
+ return (kthread_stop_info.k == current);
+}
+EXPORT_SYMBOL(kthread_should_stop);
+
+static void kthread_exit_files(void)
+{
+ struct fs_struct *fs;
+ struct task_struct *tsk = current;
+
+ exit_fs(tsk); /* current->fs->count--; */
+ fs = init_task.fs;
+ tsk->fs = fs;
+ atomic_inc(&fs->count);
+ exit_files(tsk);
+ current->files = init_task.files;
+ atomic_inc(&tsk->files->count);
+}
+
+static int kthread(void *_create)
+{
+ struct kthread_create_info *create = _create;
+ int (*threadfn)(void *data);
+ void *data;
+ sigset_t blocked;
+ int ret = -EINTR;
+
+ kthread_exit_files();
+
+ /* Copy data: it's on keventd's stack */
+ threadfn = create->threadfn;
+ data = create->data;
+
+ /* Block and flush all signals (in case we're not from keventd). */
+ sigfillset(&blocked);
+ sigprocmask(SIG_BLOCK, &blocked, NULL);
+ flush_signals(current);
+
+ /* By default we can run anywhere, unlike keventd. */
+ set_cpus_allowed(current, CPU_MASK_ALL);
+
+ /* OK, tell user we're spawned, wait for stop or wakeup */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ complete(&create->started);
+ schedule();
+
+ if (!kthread_should_stop())
+ ret = threadfn(data);
+
+ /* It might have exited on its own, w/o kthread_stop. Check. */
+ if (kthread_should_stop()) {
+ kthread_stop_info.err = ret;
+ complete(&kthread_stop_info.done);
+ }
+ return 0;
+}
+
+/* We are keventd: create a thread. */
+static void keventd_create_kthread(void *_create)
+{
+ struct kthread_create_info *create = _create;
+ int pid;
+
+ /* We want our own signal handler (we take no signals by default). */
+ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+ if (pid < 0) {
+ create->result = ERR_PTR(pid);
+ } else {
+ wait_for_completion(&create->started);
+ create->result = find_task_by_pid(pid);
+ }
+ complete(&create->done);
+}
+
+struct task_struct *kthread_create(int (*threadfn)(void *data),
+ void *data,
+ const char namefmt[],
+ ...)
+{
+ struct kthread_create_info create;
+ DECLARE_WORK(work, keventd_create_kthread, &create);
+
+ create.threadfn = threadfn;
+ create.data = data;
+ init_completion(&create.started);
+ init_completion(&create.done);
+
+ /*
+ * The workqueue needs to start up first:
+ */
+ if (!helper_wq)
+ work.func(work.data);
+ else {
+ queue_work(helper_wq, &work);
+ wait_for_completion(&create.done);
+ }
+ if (!IS_ERR(create.result)) {
+ va_list args;
+ va_start(args, namefmt);
+ vsnprintf(create.result->comm, sizeof(create.result->comm),
+ namefmt, args);
+ va_end(args);
+ }
+
+ return create.result;
+}
+EXPORT_SYMBOL(kthread_create);
+
+void kthread_bind(struct task_struct *k, unsigned int cpu)
+{
+ BUG_ON(k->state != TASK_INTERRUPTIBLE);
+ /* Must have done schedule() in kthread() before we set_task_cpu */
+ wait_task_inactive(k);
+ set_task_cpu(k, cpu);
+ k->cpus_allowed = cpumask_of_cpu(cpu);
+}
+EXPORT_SYMBOL(kthread_bind);
+
+int kthread_stop(struct task_struct *k)
+{
+ int ret;
+
+ down(&kthread_stop_lock);
+
+ /* It could exit after stop_info.k set, but before wake_up_process. */
+ get_task_struct(k);
+
+ /* Must init completion *before* thread sees kthread_stop_info.k */
+ init_completion(&kthread_stop_info.done);
+ wmb();
+
+ /* Now set kthread_should_stop() to true, and wake it up. */
+ kthread_stop_info.k = k;
+ wake_up_process(k);
+ put_task_struct(k);
+
+ /* Once it dies, reset stop ptr, gather result and we're done. */
+ wait_for_completion(&kthread_stop_info.done);
+ kthread_stop_info.k = NULL;
+ ret = kthread_stop_info.err;
+ up(&kthread_stop_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(kthread_stop);
+
+static __init int helper_init(void)
+{
+ helper_wq = create_singlethread_workqueue("kthread");
+ BUG_ON(!helper_wq);
+
+ return 0;
+}
+core_initcall(helper_init);
+
diff --git a/kernel/module.c b/kernel/module.c
new file mode 100644
index 000000000000..2dbfa0773faf
--- /dev/null
+++ b/kernel/module.c
@@ -0,0 +1,2108 @@
+/* Rewritten by Rusty Russell, on the backs of many others...
+ Copyright (C) 2002 Richard Henderson
+ Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/moduleloader.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/elf.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/moduleparam.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/vermagic.h>
+#include <linux/notifier.h>
+#include <linux/stop_machine.h>
+#include <linux/device.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+#include <asm/cacheflush.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt , a...)
+#endif
+
+#ifndef ARCH_SHF_SMALL
+#define ARCH_SHF_SMALL 0
+#endif
+
+/* If this is set, the section belongs in the init part of the module */
+#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
+
+/* Protects module list */
+static DEFINE_SPINLOCK(modlist_lock);
+
+/* List of modules, protected by module_mutex AND modlist_lock */
+static DECLARE_MUTEX(module_mutex);
+static LIST_HEAD(modules);
+
+static DECLARE_MUTEX(notify_mutex);
+static struct notifier_block * module_notify_list;
+
+int register_module_notifier(struct notifier_block * nb)
+{
+ int err;
+ down(&notify_mutex);
+ err = notifier_chain_register(&module_notify_list, nb);
+ up(&notify_mutex);
+ return err;
+}
+EXPORT_SYMBOL(register_module_notifier);
+
+int unregister_module_notifier(struct notifier_block * nb)
+{
+ int err;
+ down(&notify_mutex);
+ err = notifier_chain_unregister(&module_notify_list, nb);
+ up(&notify_mutex);
+ return err;
+}
+EXPORT_SYMBOL(unregister_module_notifier);
+
+/* We require a truly strong try_module_get() */
+static inline int strong_try_module_get(struct module *mod)
+{
+ if (mod && mod->state == MODULE_STATE_COMING)
+ return 0;
+ return try_module_get(mod);
+}
+
+/* A thread that wants to hold a reference to a module only while it
+ * is running can call ths to safely exit.
+ * nfsd and lockd use this.
+ */
+void __module_put_and_exit(struct module *mod, long code)
+{
+ module_put(mod);
+ do_exit(code);
+}
+EXPORT_SYMBOL(__module_put_and_exit);
+
+/* Find a module section: 0 means not found. */
+static unsigned int find_sec(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings,
+ const char *name)
+{
+ unsigned int i;
+
+ for (i = 1; i < hdr->e_shnum; i++)
+ /* Alloc bit cleared means "ignore it." */
+ if ((sechdrs[i].sh_flags & SHF_ALLOC)
+ && strcmp(secstrings+sechdrs[i].sh_name, name) == 0)
+ return i;
+ return 0;
+}
+
+/* Provided by the linker */
+extern const struct kernel_symbol __start___ksymtab[];
+extern const struct kernel_symbol __stop___ksymtab[];
+extern const struct kernel_symbol __start___ksymtab_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const unsigned long __start___kcrctab[];
+extern const unsigned long __start___kcrctab_gpl[];
+
+#ifndef CONFIG_MODVERSIONS
+#define symversion(base, idx) NULL
+#else
+#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL)
+#endif
+
+/* Find a symbol, return value, crc and module which owns it */
+static unsigned long __find_symbol(const char *name,
+ struct module **owner,
+ const unsigned long **crc,
+ int gplok)
+{
+ struct module *mod;
+ unsigned int i;
+
+ /* Core kernel first. */
+ *owner = NULL;
+ for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) {
+ if (strcmp(__start___ksymtab[i].name, name) == 0) {
+ *crc = symversion(__start___kcrctab, i);
+ return __start___ksymtab[i].value;
+ }
+ }
+ if (gplok) {
+ for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++)
+ if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) {
+ *crc = symversion(__start___kcrctab_gpl, i);
+ return __start___ksymtab_gpl[i].value;
+ }
+ }
+
+ /* Now try modules. */
+ list_for_each_entry(mod, &modules, list) {
+ *owner = mod;
+ for (i = 0; i < mod->num_syms; i++)
+ if (strcmp(mod->syms[i].name, name) == 0) {
+ *crc = symversion(mod->crcs, i);
+ return mod->syms[i].value;
+ }
+
+ if (gplok) {
+ for (i = 0; i < mod->num_gpl_syms; i++) {
+ if (strcmp(mod->gpl_syms[i].name, name) == 0) {
+ *crc = symversion(mod->gpl_crcs, i);
+ return mod->gpl_syms[i].value;
+ }
+ }
+ }
+ }
+ DEBUGP("Failed to find symbol %s\n", name);
+ return 0;
+}
+
+/* Find a symbol in this elf symbol table */
+static unsigned long find_local_symbol(Elf_Shdr *sechdrs,
+ unsigned int symindex,
+ const char *strtab,
+ const char *name)
+{
+ unsigned int i;
+ Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
+
+ /* Search (defined) internal symbols first. */
+ for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) {
+ if (sym[i].st_shndx != SHN_UNDEF
+ && strcmp(name, strtab + sym[i].st_name) == 0)
+ return sym[i].st_value;
+ }
+ return 0;
+}
+
+/* Search for module by name: must hold module_mutex. */
+static struct module *find_module(const char *name)
+{
+ struct module *mod;
+
+ list_for_each_entry(mod, &modules, list) {
+ if (strcmp(mod->name, name) == 0)
+ return mod;
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_SMP
+/* Number of blocks used and allocated. */
+static unsigned int pcpu_num_used, pcpu_num_allocated;
+/* Size of each block. -ve means used. */
+static int *pcpu_size;
+
+static int split_block(unsigned int i, unsigned short size)
+{
+ /* Reallocation required? */
+ if (pcpu_num_used + 1 > pcpu_num_allocated) {
+ int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2,
+ GFP_KERNEL);
+ if (!new)
+ return 0;
+
+ memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated);
+ pcpu_num_allocated *= 2;
+ kfree(pcpu_size);
+ pcpu_size = new;
+ }
+
+ /* Insert a new subblock */
+ memmove(&pcpu_size[i+1], &pcpu_size[i],
+ sizeof(pcpu_size[0]) * (pcpu_num_used - i));
+ pcpu_num_used++;
+
+ pcpu_size[i+1] -= size;
+ pcpu_size[i] = size;
+ return 1;
+}
+
+static inline unsigned int block_size(int val)
+{
+ if (val < 0)
+ return -val;
+ return val;
+}
+
+/* Created by linker magic */
+extern char __per_cpu_start[], __per_cpu_end[];
+
+static void *percpu_modalloc(unsigned long size, unsigned long align)
+{
+ unsigned long extra;
+ unsigned int i;
+ void *ptr;
+
+ BUG_ON(align > SMP_CACHE_BYTES);
+
+ ptr = __per_cpu_start;
+ for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
+ /* Extra for alignment requirement. */
+ extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
+ BUG_ON(i == 0 && extra != 0);
+
+ if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
+ continue;
+
+ /* Transfer extra to previous block. */
+ if (pcpu_size[i-1] < 0)
+ pcpu_size[i-1] -= extra;
+ else
+ pcpu_size[i-1] += extra;
+ pcpu_size[i] -= extra;
+ ptr += extra;
+
+ /* Split block if warranted */
+ if (pcpu_size[i] - size > sizeof(unsigned long))
+ if (!split_block(i, size))
+ return NULL;
+
+ /* Mark allocated */
+ pcpu_size[i] = -pcpu_size[i];
+ return ptr;
+ }
+
+ printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
+ size);
+ return NULL;
+}
+
+static void percpu_modfree(void *freeme)
+{
+ unsigned int i;
+ void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
+
+ /* First entry is core kernel percpu data. */
+ for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
+ if (ptr == freeme) {
+ pcpu_size[i] = -pcpu_size[i];
+ goto free;
+ }
+ }
+ BUG();
+
+ free:
+ /* Merge with previous? */
+ if (pcpu_size[i-1] >= 0) {
+ pcpu_size[i-1] += pcpu_size[i];
+ pcpu_num_used--;
+ memmove(&pcpu_size[i], &pcpu_size[i+1],
+ (pcpu_num_used - i) * sizeof(pcpu_size[0]));
+ i--;
+ }
+ /* Merge with next? */
+ if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
+ pcpu_size[i] += pcpu_size[i+1];
+ pcpu_num_used--;
+ memmove(&pcpu_size[i+1], &pcpu_size[i+2],
+ (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
+ }
+}
+
+static unsigned int find_pcpusec(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings)
+{
+ return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+}
+
+static int percpu_modinit(void)
+{
+ pcpu_num_used = 2;
+ pcpu_num_allocated = 2;
+ pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
+ GFP_KERNEL);
+ /* Static in-kernel percpu data (used). */
+ pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES);
+ /* Free room. */
+ pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
+ if (pcpu_size[1] < 0) {
+ printk(KERN_ERR "No per-cpu room for modules.\n");
+ pcpu_num_used = 1;
+ }
+
+ return 0;
+}
+__initcall(percpu_modinit);
+#else /* ... !CONFIG_SMP */
+static inline void *percpu_modalloc(unsigned long size, unsigned long align)
+{
+ return NULL;
+}
+static inline void percpu_modfree(void *pcpuptr)
+{
+ BUG();
+}
+static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings)
+{
+ return 0;
+}
+static inline void percpu_modcopy(void *pcpudst, const void *src,
+ unsigned long size)
+{
+ /* pcpusec should be 0, and size of that section should be 0. */
+ BUG_ON(size != 0);
+}
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_MODULE_UNLOAD
+/* Init the unload section of the module. */
+static void module_unload_init(struct module *mod)
+{
+ unsigned int i;
+
+ INIT_LIST_HEAD(&mod->modules_which_use_me);
+ for (i = 0; i < NR_CPUS; i++)
+ local_set(&mod->ref[i].count, 0);
+ /* Hold reference count during initialization. */
+ local_set(&mod->ref[_smp_processor_id()].count, 1);
+ /* Backwards compatibility macros put refcount during init. */
+ mod->waiter = current;
+}
+
+/* modules using other modules */
+struct module_use
+{
+ struct list_head list;
+ struct module *module_which_uses;
+};
+
+/* Does a already use b? */
+static int already_uses(struct module *a, struct module *b)
+{
+ struct module_use *use;
+
+ list_for_each_entry(use, &b->modules_which_use_me, list) {
+ if (use->module_which_uses == a) {
+ DEBUGP("%s uses %s!\n", a->name, b->name);
+ return 1;
+ }
+ }
+ DEBUGP("%s does not use %s!\n", a->name, b->name);
+ return 0;
+}
+
+/* Module a uses b */
+static int use_module(struct module *a, struct module *b)
+{
+ struct module_use *use;
+ if (b == NULL || already_uses(a, b)) return 1;
+
+ if (!strong_try_module_get(b))
+ return 0;
+
+ DEBUGP("Allocating new usage for %s.\n", a->name);
+ use = kmalloc(sizeof(*use), GFP_ATOMIC);
+ if (!use) {
+ printk("%s: out of memory loading\n", a->name);
+ module_put(b);
+ return 0;
+ }
+
+ use->module_which_uses = a;
+ list_add(&use->list, &b->modules_which_use_me);
+ return 1;
+}
+
+/* Clear the unload stuff of the module. */
+static void module_unload_free(struct module *mod)
+{
+ struct module *i;
+
+ list_for_each_entry(i, &modules, list) {
+ struct module_use *use;
+
+ list_for_each_entry(use, &i->modules_which_use_me, list) {
+ if (use->module_which_uses == mod) {
+ DEBUGP("%s unusing %s\n", mod->name, i->name);
+ module_put(i);
+ list_del(&use->list);
+ kfree(use);
+ /* There can be at most one match. */
+ break;
+ }
+ }
+ }
+}
+
+#ifdef CONFIG_MODULE_FORCE_UNLOAD
+static inline int try_force(unsigned int flags)
+{
+ int ret = (flags & O_TRUNC);
+ if (ret)
+ tainted |= TAINT_FORCED_MODULE;
+ return ret;
+}
+#else
+static inline int try_force(unsigned int flags)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULE_FORCE_UNLOAD */
+
+struct stopref
+{
+ struct module *mod;
+ int flags;
+ int *forced;
+};
+
+/* Whole machine is stopped with interrupts off when this runs. */
+static int __try_stop_module(void *_sref)
+{
+ struct stopref *sref = _sref;
+
+ /* If it's not unused, quit unless we are told to block. */
+ if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) {
+ if (!(*sref->forced = try_force(sref->flags)))
+ return -EWOULDBLOCK;
+ }
+
+ /* Mark it as dying. */
+ sref->mod->state = MODULE_STATE_GOING;
+ return 0;
+}
+
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
+ struct stopref sref = { mod, flags, forced };
+
+ return stop_machine_run(__try_stop_module, &sref, NR_CPUS);
+}
+
+unsigned int module_refcount(struct module *mod)
+{
+ unsigned int i, total = 0;
+
+ for (i = 0; i < NR_CPUS; i++)
+ total += local_read(&mod->ref[i].count);
+ return total;
+}
+EXPORT_SYMBOL(module_refcount);
+
+/* This exists whether we can unload or not */
+static void free_module(struct module *mod);
+
+static void wait_for_zero_refcount(struct module *mod)
+{
+ /* Since we might sleep for some time, drop the semaphore first */
+ up(&module_mutex);
+ for (;;) {
+ DEBUGP("Looking at refcount...\n");
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (module_refcount(mod) == 0)
+ break;
+ schedule();
+ }
+ current->state = TASK_RUNNING;
+ down(&module_mutex);
+}
+
+asmlinkage long
+sys_delete_module(const char __user *name_user, unsigned int flags)
+{
+ struct module *mod;
+ char name[MODULE_NAME_LEN];
+ int ret, forced = 0;
+
+ if (!capable(CAP_SYS_MODULE))
+ return -EPERM;
+
+ if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
+ return -EFAULT;
+ name[MODULE_NAME_LEN-1] = '\0';
+
+ if (down_interruptible(&module_mutex) != 0)
+ return -EINTR;
+
+ mod = find_module(name);
+ if (!mod) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (!list_empty(&mod->modules_which_use_me)) {
+ /* Other modules depend on us: get rid of them first. */
+ ret = -EWOULDBLOCK;
+ goto out;
+ }
+
+ /* Doing init or already dying? */
+ if (mod->state != MODULE_STATE_LIVE) {
+ /* FIXME: if (force), slam module count and wake up
+ waiter --RR */
+ DEBUGP("%s already dying\n", mod->name);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* If it has an init func, it must have an exit func to unload */
+ if ((mod->init != NULL && mod->exit == NULL)
+ || mod->unsafe) {
+ forced = try_force(flags);
+ if (!forced) {
+ /* This module can't be removed */
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ /* Set this up before setting mod->state */
+ mod->waiter = current;
+
+ /* Stop the machine so refcounts can't move and disable module. */
+ ret = try_stop_module(mod, flags, &forced);
+ if (ret != 0)
+ goto out;
+
+ /* Never wait if forced. */
+ if (!forced && module_refcount(mod) != 0)
+ wait_for_zero_refcount(mod);
+
+ /* Final destruction now noone is using it. */
+ if (mod->exit != NULL) {
+ up(&module_mutex);
+ mod->exit();
+ down(&module_mutex);
+ }
+ free_module(mod);
+
+ out:
+ up(&module_mutex);
+ return ret;
+}
+
+static void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ struct module_use *use;
+ int printed_something = 0;
+
+ seq_printf(m, " %u ", module_refcount(mod));
+
+ /* Always include a trailing , so userspace can differentiate
+ between this and the old multi-field proc format. */
+ list_for_each_entry(use, &mod->modules_which_use_me, list) {
+ printed_something = 1;
+ seq_printf(m, "%s,", use->module_which_uses->name);
+ }
+
+ if (mod->unsafe) {
+ printed_something = 1;
+ seq_printf(m, "[unsafe],");
+ }
+
+ if (mod->init != NULL && mod->exit == NULL) {
+ printed_something = 1;
+ seq_printf(m, "[permanent],");
+ }
+
+ if (!printed_something)
+ seq_printf(m, "-");
+}
+
+void __symbol_put(const char *symbol)
+{
+ struct module *owner;
+ unsigned long flags;
+ const unsigned long *crc;
+
+ spin_lock_irqsave(&modlist_lock, flags);
+ if (!__find_symbol(symbol, &owner, &crc, 1))
+ BUG();
+ module_put(owner);
+ spin_unlock_irqrestore(&modlist_lock, flags);
+}
+EXPORT_SYMBOL(__symbol_put);
+
+void symbol_put_addr(void *addr)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&modlist_lock, flags);
+ if (!kernel_text_address((unsigned long)addr))
+ BUG();
+
+ module_put(module_text_address((unsigned long)addr));
+ spin_unlock_irqrestore(&modlist_lock, flags);
+}
+EXPORT_SYMBOL_GPL(symbol_put_addr);
+
+static ssize_t show_refcnt(struct module_attribute *mattr,
+ struct module *mod, char *buffer)
+{
+ /* sysfs holds a reference */
+ return sprintf(buffer, "%u\n", module_refcount(mod)-1);
+}
+
+static struct module_attribute refcnt = {
+ .attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE },
+ .show = show_refcnt,
+};
+
+#else /* !CONFIG_MODULE_UNLOAD */
+static void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ /* We don't know the usage count, or what modules are using. */
+ seq_printf(m, " - -");
+}
+
+static inline void module_unload_free(struct module *mod)
+{
+}
+
+static inline int use_module(struct module *a, struct module *b)
+{
+ return strong_try_module_get(b);
+}
+
+static inline void module_unload_init(struct module *mod)
+{
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+#ifdef CONFIG_OBSOLETE_MODPARM
+/* Bounds checking done below */
+static int obsparm_copy_string(const char *val, struct kernel_param *kp)
+{
+ strcpy(kp->arg, val);
+ return 0;
+}
+
+int set_obsolete(const char *val, struct kernel_param *kp)
+{
+ unsigned int min, max;
+ unsigned int size, maxsize;
+ int dummy;
+ char *endp;
+ const char *p;
+ struct obsolete_modparm *obsparm = kp->arg;
+
+ if (!val) {
+ printk(KERN_ERR "Parameter %s needs an argument\n", kp->name);
+ return -EINVAL;
+ }
+
+ /* type is: [min[-max]]{b,h,i,l,s} */
+ p = obsparm->type;
+ min = simple_strtol(p, &endp, 10);
+ if (endp == obsparm->type)
+ min = max = 1;
+ else if (*endp == '-') {
+ p = endp+1;
+ max = simple_strtol(p, &endp, 10);
+ } else
+ max = min;
+ switch (*endp) {
+ case 'b':
+ return param_array(kp->name, val, min, max, obsparm->addr,
+ 1, param_set_byte, &dummy);
+ case 'h':
+ return param_array(kp->name, val, min, max, obsparm->addr,
+ sizeof(short), param_set_short, &dummy);
+ case 'i':
+ return param_array(kp->name, val, min, max, obsparm->addr,
+ sizeof(int), param_set_int, &dummy);
+ case 'l':
+ return param_array(kp->name, val, min, max, obsparm->addr,
+ sizeof(long), param_set_long, &dummy);
+ case 's':
+ return param_array(kp->name, val, min, max, obsparm->addr,
+ sizeof(char *), param_set_charp, &dummy);
+
+ case 'c':
+ /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars,
+ and the decl is "char xxx[5][50];" */
+ p = endp+1;
+ maxsize = simple_strtol(p, &endp, 10);
+ /* We check lengths here (yes, this is a hack). */
+ p = val;
+ while (p[size = strcspn(p, ",")]) {
+ if (size >= maxsize)
+ goto oversize;
+ p += size+1;
+ }
+ if (size >= maxsize)
+ goto oversize;
+ return param_array(kp->name, val, min, max, obsparm->addr,
+ maxsize, obsparm_copy_string, &dummy);
+ }
+ printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type);
+ return -EINVAL;
+ oversize:
+ printk(KERN_ERR
+ "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize);
+ return -EINVAL;
+}
+
+static int obsolete_params(const char *name,
+ char *args,
+ struct obsolete_modparm obsparm[],
+ unsigned int num,
+ Elf_Shdr *sechdrs,
+ unsigned int symindex,
+ const char *strtab)
+{
+ struct kernel_param *kp;
+ unsigned int i;
+ int ret;
+
+ kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL);
+ if (!kp)
+ return -ENOMEM;
+
+ for (i = 0; i < num; i++) {
+ char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)];
+
+ snprintf(sym_name, sizeof(sym_name), "%s%s",
+ MODULE_SYMBOL_PREFIX, obsparm[i].name);
+
+ kp[i].name = obsparm[i].name;
+ kp[i].perm = 000;
+ kp[i].set = set_obsolete;
+ kp[i].get = NULL;
+ obsparm[i].addr
+ = (void *)find_local_symbol(sechdrs, symindex, strtab,
+ sym_name);
+ if (!obsparm[i].addr) {
+ printk("%s: falsely claims to have parameter %s\n",
+ name, obsparm[i].name);
+ ret = -EINVAL;
+ goto out;
+ }
+ kp[i].arg = &obsparm[i];
+ }
+
+ ret = parse_args(name, args, kp, num, NULL);
+ out:
+ kfree(kp);
+ return ret;
+}
+#else
+static int obsolete_params(const char *name,
+ char *args,
+ struct obsolete_modparm obsparm[],
+ unsigned int num,
+ Elf_Shdr *sechdrs,
+ unsigned int symindex,
+ const char *strtab)
+{
+ if (num != 0)
+ printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
+ name);
+ return 0;
+}
+#endif /* CONFIG_OBSOLETE_MODPARM */
+
+static const char vermagic[] = VERMAGIC_STRING;
+
+#ifdef CONFIG_MODVERSIONS
+static int check_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ const char *symname,
+ struct module *mod,
+ const unsigned long *crc)
+{
+ unsigned int i, num_versions;
+ struct modversion_info *versions;
+
+ /* Exporting module didn't supply crcs? OK, we're already tainted. */
+ if (!crc)
+ return 1;
+
+ versions = (void *) sechdrs[versindex].sh_addr;
+ num_versions = sechdrs[versindex].sh_size
+ / sizeof(struct modversion_info);
+
+ for (i = 0; i < num_versions; i++) {
+ if (strcmp(versions[i].name, symname) != 0)
+ continue;
+
+ if (versions[i].crc == *crc)
+ return 1;
+ printk("%s: disagrees about version of symbol %s\n",
+ mod->name, symname);
+ DEBUGP("Found checksum %lX vs module %lX\n",
+ *crc, versions[i].crc);
+ return 0;
+ }
+ /* Not in module's version table. OK, but that taints the kernel. */
+ if (!(tainted & TAINT_FORCED_MODULE)) {
+ printk("%s: no version for \"%s\" found: kernel tainted.\n",
+ mod->name, symname);
+ tainted |= TAINT_FORCED_MODULE;
+ }
+ return 1;
+}
+
+static inline int check_modstruct_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ struct module *mod)
+{
+ const unsigned long *crc;
+ struct module *owner;
+
+ if (!__find_symbol("struct_module", &owner, &crc, 1))
+ BUG();
+ return check_version(sechdrs, versindex, "struct_module", mod,
+ crc);
+}
+
+/* First part is kernel version, which we ignore. */
+static inline int same_magic(const char *amagic, const char *bmagic)
+{
+ amagic += strcspn(amagic, " ");
+ bmagic += strcspn(bmagic, " ");
+ return strcmp(amagic, bmagic) == 0;
+}
+#else
+static inline int check_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ const char *symname,
+ struct module *mod,
+ const unsigned long *crc)
+{
+ return 1;
+}
+
+static inline int check_modstruct_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ struct module *mod)
+{
+ return 1;
+}
+
+static inline int same_magic(const char *amagic, const char *bmagic)
+{
+ return strcmp(amagic, bmagic) == 0;
+}
+#endif /* CONFIG_MODVERSIONS */
+
+/* Resolve a symbol for this module. I.e. if we find one, record usage.
+ Must be holding module_mutex. */
+static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ const char *name,
+ struct module *mod)
+{
+ struct module *owner;
+ unsigned long ret;
+ const unsigned long *crc;
+
+ spin_lock_irq(&modlist_lock);
+ ret = __find_symbol(name, &owner, &crc, mod->license_gplok);
+ if (ret) {
+ /* use_module can fail due to OOM, or module unloading */
+ if (!check_version(sechdrs, versindex, name, mod, crc) ||
+ !use_module(mod, owner))
+ ret = 0;
+ }
+ spin_unlock_irq(&modlist_lock);
+ return ret;
+}
+
+
+/*
+ * /sys/module/foo/sections stuff
+ * J. Corbet <corbet@lwn.net>
+ */
+#ifdef CONFIG_KALLSYMS
+static ssize_t module_sect_show(struct module_attribute *mattr,
+ struct module *mod, char *buf)
+{
+ struct module_sect_attr *sattr =
+ container_of(mattr, struct module_sect_attr, mattr);
+ return sprintf(buf, "0x%lx\n", sattr->address);
+}
+
+static void add_sect_attrs(struct module *mod, unsigned int nsect,
+ char *secstrings, Elf_Shdr *sechdrs)
+{
+ unsigned int nloaded = 0, i, size[2];
+ struct module_sect_attrs *sect_attrs;
+ struct module_sect_attr *sattr;
+ struct attribute **gattr;
+
+ /* Count loaded sections and allocate structures */
+ for (i = 0; i < nsect; i++)
+ if (sechdrs[i].sh_flags & SHF_ALLOC)
+ nloaded++;
+ size[0] = ALIGN(sizeof(*sect_attrs)
+ + nloaded * sizeof(sect_attrs->attrs[0]),
+ sizeof(sect_attrs->grp.attrs[0]));
+ size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
+ if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
+ return;
+
+ /* Setup section attributes. */
+ sect_attrs->grp.name = "sections";
+ sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
+
+ sattr = &sect_attrs->attrs[0];
+ gattr = &sect_attrs->grp.attrs[0];
+ for (i = 0; i < nsect; i++) {
+ if (! (sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+ sattr->address = sechdrs[i].sh_addr;
+ strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
+ MODULE_SECT_NAME_LEN);
+ sattr->mattr.show = module_sect_show;
+ sattr->mattr.store = NULL;
+ sattr->mattr.attr.name = sattr->name;
+ sattr->mattr.attr.owner = mod;
+ sattr->mattr.attr.mode = S_IRUGO;
+ *(gattr++) = &(sattr++)->mattr.attr;
+ }
+ *gattr = NULL;
+
+ if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+ goto out;
+
+ mod->sect_attrs = sect_attrs;
+ return;
+ out:
+ kfree(sect_attrs);
+}
+
+static void remove_sect_attrs(struct module *mod)
+{
+ if (mod->sect_attrs) {
+ sysfs_remove_group(&mod->mkobj.kobj,
+ &mod->sect_attrs->grp);
+ /* We are positive that no one is using any sect attrs
+ * at this point. Deallocate immediately. */
+ kfree(mod->sect_attrs);
+ mod->sect_attrs = NULL;
+ }
+}
+
+
+#else
+static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
+ char *sectstrings, Elf_Shdr *sechdrs)
+{
+}
+
+static inline void remove_sect_attrs(struct module *mod)
+{
+}
+#endif /* CONFIG_KALLSYMS */
+
+
+#ifdef CONFIG_MODULE_UNLOAD
+static inline int module_add_refcnt_attr(struct module *mod)
+{
+ return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr);
+}
+static void module_remove_refcnt_attr(struct module *mod)
+{
+ return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr);
+}
+#else
+static inline int module_add_refcnt_attr(struct module *mod)
+{
+ return 0;
+}
+static void module_remove_refcnt_attr(struct module *mod)
+{
+}
+#endif
+
+
+static int mod_sysfs_setup(struct module *mod,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ int err;
+
+ memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+ err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
+ if (err)
+ goto out;
+ kobj_set_kset_s(&mod->mkobj, module_subsys);
+ mod->mkobj.mod = mod;
+ err = kobject_register(&mod->mkobj.kobj);
+ if (err)
+ goto out;
+
+ err = module_add_refcnt_attr(mod);
+ if (err)
+ goto out_unreg;
+
+ err = module_param_sysfs_setup(mod, kparam, num_params);
+ if (err)
+ goto out_unreg;
+
+ return 0;
+
+out_unreg:
+ kobject_unregister(&mod->mkobj.kobj);
+out:
+ return err;
+}
+
+static void mod_kobject_remove(struct module *mod)
+{
+ module_remove_refcnt_attr(mod);
+ module_param_sysfs_remove(mod);
+
+ kobject_unregister(&mod->mkobj.kobj);
+}
+
+/*
+ * unlink the module with the whole machine is stopped with interrupts off
+ * - this defends against kallsyms not taking locks
+ */
+static int __unlink_module(void *_mod)
+{
+ struct module *mod = _mod;
+ list_del(&mod->list);
+ return 0;
+}
+
+/* Free a module, remove from lists, etc (must hold module mutex). */
+static void free_module(struct module *mod)
+{
+ /* Delete from various lists */
+ stop_machine_run(__unlink_module, mod, NR_CPUS);
+ remove_sect_attrs(mod);
+ mod_kobject_remove(mod);
+
+ /* Arch-specific cleanup. */
+ module_arch_cleanup(mod);
+
+ /* Module unload stuff */
+ module_unload_free(mod);
+
+ /* This may be NULL, but that's OK */
+ module_free(mod, mod->module_init);
+ kfree(mod->args);
+ if (mod->percpu)
+ percpu_modfree(mod->percpu);
+
+ /* Finally, free the core (containing the module structure) */
+ module_free(mod, mod->module_core);
+}
+
+void *__symbol_get(const char *symbol)
+{
+ struct module *owner;
+ unsigned long value, flags;
+ const unsigned long *crc;
+
+ spin_lock_irqsave(&modlist_lock, flags);
+ value = __find_symbol(symbol, &owner, &crc, 1);
+ if (value && !strong_try_module_get(owner))
+ value = 0;
+ spin_unlock_irqrestore(&modlist_lock, flags);
+
+ return (void *)value;
+}
+EXPORT_SYMBOL_GPL(__symbol_get);
+
+/* Change all symbols so that sh_value encodes the pointer directly. */
+static int simplify_symbols(Elf_Shdr *sechdrs,
+ unsigned int symindex,
+ const char *strtab,
+ unsigned int versindex,
+ unsigned int pcpuindex,
+ struct module *mod)
+{
+ Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
+ unsigned long secbase;
+ unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
+ int ret = 0;
+
+ for (i = 1; i < n; i++) {
+ switch (sym[i].st_shndx) {
+ case SHN_COMMON:
+ /* We compiled with -fno-common. These are not
+ supposed to happen. */
+ DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name);
+ printk("%s: please compile with -fno-common\n",
+ mod->name);
+ ret = -ENOEXEC;
+ break;
+
+ case SHN_ABS:
+ /* Don't need to do anything */
+ DEBUGP("Absolute symbol: 0x%08lx\n",
+ (long)sym[i].st_value);
+ break;
+
+ case SHN_UNDEF:
+ sym[i].st_value
+ = resolve_symbol(sechdrs, versindex,
+ strtab + sym[i].st_name, mod);
+
+ /* Ok if resolved. */
+ if (sym[i].st_value != 0)
+ break;
+ /* Ok if weak. */
+ if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
+ break;
+
+ printk(KERN_WARNING "%s: Unknown symbol %s\n",
+ mod->name, strtab + sym[i].st_name);
+ ret = -ENOENT;
+ break;
+
+ default:
+ /* Divert to percpu allocation if a percpu var. */
+ if (sym[i].st_shndx == pcpuindex)
+ secbase = (unsigned long)mod->percpu;
+ else
+ secbase = sechdrs[sym[i].st_shndx].sh_addr;
+ sym[i].st_value += secbase;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/* Update size with this section: return offset. */
+static long get_offset(unsigned long *size, Elf_Shdr *sechdr)
+{
+ long ret;
+
+ ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
+ *size = ret + sechdr->sh_size;
+ return ret;
+}
+
+/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
+ might -- code, read-only data, read-write data, small data. Tally
+ sizes, and place the offsets into sh_entsize fields: high bit means it
+ belongs in init. */
+static void layout_sections(struct module *mod,
+ const Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings)
+{
+ static unsigned long const masks[][2] = {
+ /* NOTE: all executable code must be the first section
+ * in this array; otherwise modify the text_size
+ * finder in the two loops below */
+ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
+ { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+ { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
+ { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
+ };
+ unsigned int m, i;
+
+ for (i = 0; i < hdr->e_shnum; i++)
+ sechdrs[i].sh_entsize = ~0UL;
+
+ DEBUGP("Core section allocation order:\n");
+ for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ for (i = 0; i < hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &sechdrs[i];
+
+ if ((s->sh_flags & masks[m][0]) != masks[m][0]
+ || (s->sh_flags & masks[m][1])
+ || s->sh_entsize != ~0UL
+ || strncmp(secstrings + s->sh_name,
+ ".init", 5) == 0)
+ continue;
+ s->sh_entsize = get_offset(&mod->core_size, s);
+ DEBUGP("\t%s\n", secstrings + s->sh_name);
+ }
+ if (m == 0)
+ mod->core_text_size = mod->core_size;
+ }
+
+ DEBUGP("Init section allocation order:\n");
+ for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ for (i = 0; i < hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &sechdrs[i];
+
+ if ((s->sh_flags & masks[m][0]) != masks[m][0]
+ || (s->sh_flags & masks[m][1])
+ || s->sh_entsize != ~0UL
+ || strncmp(secstrings + s->sh_name,
+ ".init", 5) != 0)
+ continue;
+ s->sh_entsize = (get_offset(&mod->init_size, s)
+ | INIT_OFFSET_MASK);
+ DEBUGP("\t%s\n", secstrings + s->sh_name);
+ }
+ if (m == 0)
+ mod->init_text_size = mod->init_size;
+ }
+}
+
+static inline int license_is_gpl_compatible(const char *license)
+{
+ return (strcmp(license, "GPL") == 0
+ || strcmp(license, "GPL v2") == 0
+ || strcmp(license, "GPL and additional rights") == 0
+ || strcmp(license, "Dual BSD/GPL") == 0
+ || strcmp(license, "Dual MPL/GPL") == 0);
+}
+
+static void set_license(struct module *mod, const char *license)
+{
+ if (!license)
+ license = "unspecified";
+
+ mod->license_gplok = license_is_gpl_compatible(license);
+ if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
+ printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
+ mod->name, license);
+ tainted |= TAINT_PROPRIETARY_MODULE;
+ }
+}
+
+/* Parse tag=value strings from .modinfo section */
+static char *next_string(char *string, unsigned long *secsize)
+{
+ /* Skip non-zero chars */
+ while (string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+
+ /* Skip any zero padding. */
+ while (!string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+ return string;
+}
+
+static char *get_modinfo(Elf_Shdr *sechdrs,
+ unsigned int info,
+ const char *tag)
+{
+ char *p;
+ unsigned int taglen = strlen(tag);
+ unsigned long size = sechdrs[info].sh_size;
+
+ for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) {
+ if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
+ return p + taglen + 1;
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_KALLSYMS
+int is_exported(const char *name, const struct module *mod)
+{
+ unsigned int i;
+
+ if (!mod) {
+ for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++)
+ if (strcmp(__start___ksymtab[i].name, name) == 0)
+ return 1;
+ return 0;
+ }
+ for (i = 0; i < mod->num_syms; i++)
+ if (strcmp(mod->syms[i].name, name) == 0)
+ return 1;
+ return 0;
+}
+
+/* As per nm */
+static char elf_type(const Elf_Sym *sym,
+ Elf_Shdr *sechdrs,
+ const char *secstrings,
+ struct module *mod)
+{
+ if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
+ if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
+ return 'v';
+ else
+ return 'w';
+ }
+ if (sym->st_shndx == SHN_UNDEF)
+ return 'U';
+ if (sym->st_shndx == SHN_ABS)
+ return 'a';
+ if (sym->st_shndx >= SHN_LORESERVE)
+ return '?';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
+ return 't';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
+ && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
+ if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
+ return 'r';
+ else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 'g';
+ else
+ return 'd';
+ }
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 's';
+ else
+ return 'b';
+ }
+ if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name,
+ ".debug", strlen(".debug")) == 0)
+ return 'n';
+ return '?';
+}
+
+static void add_kallsyms(struct module *mod,
+ Elf_Shdr *sechdrs,
+ unsigned int symindex,
+ unsigned int strindex,
+ const char *secstrings)
+{
+ unsigned int i;
+
+ mod->symtab = (void *)sechdrs[symindex].sh_addr;
+ mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
+ mod->strtab = (void *)sechdrs[strindex].sh_addr;
+
+ /* Set types up while we still have access to sections. */
+ for (i = 0; i < mod->num_symtab; i++)
+ mod->symtab[i].st_info
+ = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
+}
+#else
+static inline void add_kallsyms(struct module *mod,
+ Elf_Shdr *sechdrs,
+ unsigned int symindex,
+ unsigned int strindex,
+ const char *secstrings)
+{
+}
+#endif /* CONFIG_KALLSYMS */
+
+/* Allocate and load the module: note that size of section 0 is always
+ zero, and we rely on this for optional sections. */
+static struct module *load_module(void __user *umod,
+ unsigned long len,
+ const char __user *uargs)
+{
+ Elf_Ehdr *hdr;
+ Elf_Shdr *sechdrs;
+ char *secstrings, *args, *modmagic, *strtab = NULL;
+ unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
+ exportindex, modindex, obsparmindex, infoindex, gplindex,
+ crcindex, gplcrcindex, versindex, pcpuindex;
+ long arglen;
+ struct module *mod;
+ long err = 0;
+ void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
+ struct exception_table_entry *extable;
+
+ DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
+ umod, len, uargs);
+ if (len < sizeof(*hdr))
+ return ERR_PTR(-ENOEXEC);
+
+ /* Suck in entire file: we'll want most of it. */
+ /* vmalloc barfs on "unusual" numbers. Check here */
+ if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (copy_from_user(hdr, umod, len) != 0) {
+ err = -EFAULT;
+ goto free_hdr;
+ }
+
+ /* Sanity checks against insmoding binaries or wrong arch,
+ weird elf version */
+ if (memcmp(hdr->e_ident, ELFMAG, 4) != 0
+ || hdr->e_type != ET_REL
+ || !elf_check_arch(hdr)
+ || hdr->e_shentsize != sizeof(*sechdrs)) {
+ err = -ENOEXEC;
+ goto free_hdr;
+ }
+
+ if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr))
+ goto truncated;
+
+ /* Convenience variables */
+ sechdrs = (void *)hdr + hdr->e_shoff;
+ secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+ sechdrs[0].sh_addr = 0;
+
+ for (i = 1; i < hdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type != SHT_NOBITS
+ && len < sechdrs[i].sh_offset + sechdrs[i].sh_size)
+ goto truncated;
+
+ /* Mark all sections sh_addr with their address in the
+ temporary image. */
+ sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset;
+
+ /* Internal symbols and strings. */
+ if (sechdrs[i].sh_type == SHT_SYMTAB) {
+ symindex = i;
+ strindex = sechdrs[i].sh_link;
+ strtab = (char *)hdr + sechdrs[strindex].sh_offset;
+ }
+#ifndef CONFIG_MODULE_UNLOAD
+ /* Don't load .exit sections */
+ if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
+ sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
+#endif
+ }
+
+ modindex = find_sec(hdr, sechdrs, secstrings,
+ ".gnu.linkonce.this_module");
+ if (!modindex) {
+ printk(KERN_WARNING "No module found in object\n");
+ err = -ENOEXEC;
+ goto free_hdr;
+ }
+ mod = (void *)sechdrs[modindex].sh_addr;
+
+ if (symindex == 0) {
+ printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
+ mod->name);
+ err = -ENOEXEC;
+ goto free_hdr;
+ }
+
+ /* Optional sections */
+ exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
+ gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
+ crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
+ gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
+ setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
+ exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
+ obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
+ versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
+ infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
+ pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
+
+ /* Don't keep modinfo section */
+ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
+#ifdef CONFIG_KALLSYMS
+ /* Keep symbol and string tables for decoding later. */
+ sechdrs[symindex].sh_flags |= SHF_ALLOC;
+ sechdrs[strindex].sh_flags |= SHF_ALLOC;
+#endif
+
+ /* Check module struct version now, before we try to use module. */
+ if (!check_modstruct_version(sechdrs, versindex, mod)) {
+ err = -ENOEXEC;
+ goto free_hdr;
+ }
+
+ modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
+ /* This is allowed: modprobe --force will invalidate it. */
+ if (!modmagic) {
+ tainted |= TAINT_FORCED_MODULE;
+ printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
+ mod->name);
+ } else if (!same_magic(modmagic, vermagic)) {
+ printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
+ mod->name, modmagic, vermagic);
+ err = -ENOEXEC;
+ goto free_hdr;
+ }
+
+ /* Now copy in args */
+ arglen = strlen_user(uargs);
+ if (!arglen) {
+ err = -EFAULT;
+ goto free_hdr;
+ }
+ args = kmalloc(arglen, GFP_KERNEL);
+ if (!args) {
+ err = -ENOMEM;
+ goto free_hdr;
+ }
+ if (copy_from_user(args, uargs, arglen) != 0) {
+ err = -EFAULT;
+ goto free_mod;
+ }
+
+ if (find_module(mod->name)) {
+ err = -EEXIST;
+ goto free_mod;
+ }
+
+ mod->state = MODULE_STATE_COMING;
+
+ /* Allow arches to frob section contents and sizes. */
+ err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod);
+ if (err < 0)
+ goto free_mod;
+
+ if (pcpuindex) {
+ /* We have a special allocation for this section. */
+ percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
+ sechdrs[pcpuindex].sh_addralign);
+ if (!percpu) {
+ err = -ENOMEM;
+ goto free_mod;
+ }
+ sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ mod->percpu = percpu;
+ }
+
+ /* Determine total sizes, and put offsets in sh_entsize. For now
+ this is done generically; there doesn't appear to be any
+ special cases for the architectures. */
+ layout_sections(mod, hdr, sechdrs, secstrings);
+
+ /* Do the allocs. */
+ ptr = module_alloc(mod->core_size);
+ if (!ptr) {
+ err = -ENOMEM;
+ goto free_percpu;
+ }
+ memset(ptr, 0, mod->core_size);
+ mod->module_core = ptr;
+
+ ptr = module_alloc(mod->init_size);
+ if (!ptr && mod->init_size) {
+ err = -ENOMEM;
+ goto free_core;
+ }
+ memset(ptr, 0, mod->init_size);
+ mod->module_init = ptr;
+
+ /* Transfer each section which specifies SHF_ALLOC */
+ DEBUGP("final section addresses:\n");
+ for (i = 0; i < hdr->e_shnum; i++) {
+ void *dest;
+
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK)
+ dest = mod->module_init
+ + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK);
+ else
+ dest = mod->module_core + sechdrs[i].sh_entsize;
+
+ if (sechdrs[i].sh_type != SHT_NOBITS)
+ memcpy(dest, (void *)sechdrs[i].sh_addr,
+ sechdrs[i].sh_size);
+ /* Update sh_addr to point to copy in image. */
+ sechdrs[i].sh_addr = (unsigned long)dest;
+ DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name);
+ }
+ /* Module has been moved. */
+ mod = (void *)sechdrs[modindex].sh_addr;
+
+ /* Now we've moved module, initialize linked lists, etc. */
+ module_unload_init(mod);
+
+ /* Set up license info based on the info section */
+ set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
+
+ /* Fix up syms, so that st_value is a pointer to location. */
+ err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
+ mod);
+ if (err < 0)
+ goto cleanup;
+
+ /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */
+ mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms);
+ mod->syms = (void *)sechdrs[exportindex].sh_addr;
+ if (crcindex)
+ mod->crcs = (void *)sechdrs[crcindex].sh_addr;
+ mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms);
+ mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
+ if (gplcrcindex)
+ mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
+
+#ifdef CONFIG_MODVERSIONS
+ if ((mod->num_syms && !crcindex) ||
+ (mod->num_gpl_syms && !gplcrcindex)) {
+ printk(KERN_WARNING "%s: No versions for exported symbols."
+ " Tainting kernel.\n", mod->name);
+ tainted |= TAINT_FORCED_MODULE;
+ }
+#endif
+
+ /* Now do relocations. */
+ for (i = 1; i < hdr->e_shnum; i++) {
+ const char *strtab = (char *)sechdrs[strindex].sh_addr;
+ unsigned int info = sechdrs[i].sh_info;
+
+ /* Not a valid relocation section? */
+ if (info >= hdr->e_shnum)
+ continue;
+
+ /* Don't bother with non-allocated sections */
+ if (!(sechdrs[info].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (sechdrs[i].sh_type == SHT_REL)
+ err = apply_relocate(sechdrs, strtab, symindex, i,mod);
+ else if (sechdrs[i].sh_type == SHT_RELA)
+ err = apply_relocate_add(sechdrs, strtab, symindex, i,
+ mod);
+ if (err < 0)
+ goto cleanup;
+ }
+
+ /* Set up and sort exception table */
+ mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable);
+ mod->extable = extable = (void *)sechdrs[exindex].sh_addr;
+ sort_extable(extable, extable + mod->num_exentries);
+
+ /* Finally, copy percpu area over. */
+ percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
+ sechdrs[pcpuindex].sh_size);
+
+ add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
+
+ err = module_finalize(hdr, sechdrs, mod);
+ if (err < 0)
+ goto cleanup;
+
+ mod->args = args;
+ if (obsparmindex) {
+ err = obsolete_params(mod->name, mod->args,
+ (struct obsolete_modparm *)
+ sechdrs[obsparmindex].sh_addr,
+ sechdrs[obsparmindex].sh_size
+ / sizeof(struct obsolete_modparm),
+ sechdrs, symindex,
+ (char *)sechdrs[strindex].sh_addr);
+ if (setupindex)
+ printk(KERN_WARNING "%s: Ignoring new-style "
+ "parameters in presence of obsolete ones\n",
+ mod->name);
+ } else {
+ /* Size of section 0 is 0, so this works well if no params */
+ err = parse_args(mod->name, mod->args,
+ (struct kernel_param *)
+ sechdrs[setupindex].sh_addr,
+ sechdrs[setupindex].sh_size
+ / sizeof(struct kernel_param),
+ NULL);
+ }
+ if (err < 0)
+ goto arch_cleanup;
+
+ err = mod_sysfs_setup(mod,
+ (struct kernel_param *)
+ sechdrs[setupindex].sh_addr,
+ sechdrs[setupindex].sh_size
+ / sizeof(struct kernel_param));
+ if (err < 0)
+ goto arch_cleanup;
+ add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
+
+ /* Get rid of temporary copy */
+ vfree(hdr);
+
+ /* Done! */
+ return mod;
+
+ arch_cleanup:
+ module_arch_cleanup(mod);
+ cleanup:
+ module_unload_free(mod);
+ module_free(mod, mod->module_init);
+ free_core:
+ module_free(mod, mod->module_core);
+ free_percpu:
+ if (percpu)
+ percpu_modfree(percpu);
+ free_mod:
+ kfree(args);
+ free_hdr:
+ vfree(hdr);
+ if (err < 0) return ERR_PTR(err);
+ else return ptr;
+
+ truncated:
+ printk(KERN_ERR "Module len %lu truncated\n", len);
+ err = -ENOEXEC;
+ goto free_hdr;
+}
+
+/*
+ * link the module with the whole machine is stopped with interrupts off
+ * - this defends against kallsyms not taking locks
+ */
+static int __link_module(void *_mod)
+{
+ struct module *mod = _mod;
+ list_add(&mod->list, &modules);
+ return 0;
+}
+
+/* This is where the real work happens */
+asmlinkage long
+sys_init_module(void __user *umod,
+ unsigned long len,
+ const char __user *uargs)
+{
+ struct module *mod;
+ int ret = 0;
+
+ /* Must have permission */
+ if (!capable(CAP_SYS_MODULE))
+ return -EPERM;
+
+ /* Only one module load at a time, please */
+ if (down_interruptible(&module_mutex) != 0)
+ return -EINTR;
+
+ /* Do all the hard work */
+ mod = load_module(umod, len, uargs);
+ if (IS_ERR(mod)) {
+ up(&module_mutex);
+ return PTR_ERR(mod);
+ }
+
+ /* Flush the instruction cache, since we've played with text */
+ if (mod->module_init)
+ flush_icache_range((unsigned long)mod->module_init,
+ (unsigned long)mod->module_init
+ + mod->init_size);
+ flush_icache_range((unsigned long)mod->module_core,
+ (unsigned long)mod->module_core + mod->core_size);
+
+ /* Now sew it into the lists. They won't access us, since
+ strong_try_module_get() will fail. */
+ stop_machine_run(__link_module, mod, NR_CPUS);
+
+ /* Drop lock so they can recurse */
+ up(&module_mutex);
+
+ down(&notify_mutex);
+ notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
+ up(&notify_mutex);
+
+ /* Start the module */
+ if (mod->init != NULL)
+ ret = mod->init();
+ if (ret < 0) {
+ /* Init routine failed: abort. Try to protect us from
+ buggy refcounters. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_kernel();
+ if (mod->unsafe)
+ printk(KERN_ERR "%s: module is now stuck!\n",
+ mod->name);
+ else {
+ module_put(mod);
+ down(&module_mutex);
+ free_module(mod);
+ up(&module_mutex);
+ }
+ return ret;
+ }
+
+ /* Now it's a first class citizen! */
+ down(&module_mutex);
+ mod->state = MODULE_STATE_LIVE;
+ /* Drop initial reference. */
+ module_put(mod);
+ module_free(mod, mod->module_init);
+ mod->module_init = NULL;
+ mod->init_size = 0;
+ mod->init_text_size = 0;
+ up(&module_mutex);
+
+ return 0;
+}
+
+static inline int within(unsigned long addr, void *start, unsigned long size)
+{
+ return ((void *)addr >= start && (void *)addr < start + size);
+}
+
+#ifdef CONFIG_KALLSYMS
+/*
+ * This ignores the intensely annoying "mapping symbols" found
+ * in ARM ELF files: $a, $t and $d.
+ */
+static inline int is_arm_mapping_symbol(const char *str)
+{
+ return str[0] == '$' && strchr("atd", str[1])
+ && (str[2] == '\0' || str[2] == '.');
+}
+
+static const char *get_ksymbol(struct module *mod,
+ unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset)
+{
+ unsigned int i, best = 0;
+ unsigned long nextval;
+
+ /* At worse, next value is at end of module */
+ if (within(addr, mod->module_init, mod->init_size))
+ nextval = (unsigned long)mod->module_init+mod->init_text_size;
+ else
+ nextval = (unsigned long)mod->module_core+mod->core_text_size;
+
+ /* Scan for closest preceeding symbol, and next symbol. (ELF
+ starts real symbols at 1). */
+ for (i = 1; i < mod->num_symtab; i++) {
+ if (mod->symtab[i].st_shndx == SHN_UNDEF)
+ continue;
+
+ /* We ignore unnamed symbols: they're uninformative
+ * and inserted at a whim. */
+ if (mod->symtab[i].st_value <= addr
+ && mod->symtab[i].st_value > mod->symtab[best].st_value
+ && *(mod->strtab + mod->symtab[i].st_name) != '\0'
+ && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+ best = i;
+ if (mod->symtab[i].st_value > addr
+ && mod->symtab[i].st_value < nextval
+ && *(mod->strtab + mod->symtab[i].st_name) != '\0'
+ && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+ nextval = mod->symtab[i].st_value;
+ }
+
+ if (!best)
+ return NULL;
+
+ *size = nextval - mod->symtab[best].st_value;
+ *offset = addr - mod->symtab[best].st_value;
+ return mod->strtab + mod->symtab[best].st_name;
+}
+
+/* For kallsyms to ask for address resolution. NULL means not found.
+ We don't lock, as this is used for oops resolution and races are a
+ lesser concern. */
+const char *module_address_lookup(unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset,
+ char **modname)
+{
+ struct module *mod;
+
+ list_for_each_entry(mod, &modules, list) {
+ if (within(addr, mod->module_init, mod->init_size)
+ || within(addr, mod->module_core, mod->core_size)) {
+ *modname = mod->name;
+ return get_ksymbol(mod, addr, size, offset);
+ }
+ }
+ return NULL;
+}
+
+struct module *module_get_kallsym(unsigned int symnum,
+ unsigned long *value,
+ char *type,
+ char namebuf[128])
+{
+ struct module *mod;
+
+ down(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ if (symnum < mod->num_symtab) {
+ *value = mod->symtab[symnum].st_value;
+ *type = mod->symtab[symnum].st_info;
+ strncpy(namebuf,
+ mod->strtab + mod->symtab[symnum].st_name,
+ 127);
+ up(&module_mutex);
+ return mod;
+ }
+ symnum -= mod->num_symtab;
+ }
+ up(&module_mutex);
+ return NULL;
+}
+
+static unsigned long mod_find_symname(struct module *mod, const char *name)
+{
+ unsigned int i;
+
+ for (i = 0; i < mod->num_symtab; i++)
+ if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0)
+ return mod->symtab[i].st_value;
+ return 0;
+}
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name)
+{
+ struct module *mod;
+ char *colon;
+ unsigned long ret = 0;
+
+ /* Don't lock: we're in enough trouble already. */
+ if ((colon = strchr(name, ':')) != NULL) {
+ *colon = '\0';
+ if ((mod = find_module(name)) != NULL)
+ ret = mod_find_symname(mod, colon+1);
+ *colon = ':';
+ } else {
+ list_for_each_entry(mod, &modules, list)
+ if ((ret = mod_find_symname(mod, name)) != 0)
+ break;
+ }
+ return ret;
+}
+#endif /* CONFIG_KALLSYMS */
+
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+ struct list_head *i;
+ loff_t n = 0;
+
+ down(&module_mutex);
+ list_for_each(i, &modules) {
+ if (n++ == *pos)
+ break;
+ }
+ if (i == &modules)
+ return NULL;
+ return i;
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ struct list_head *i = p;
+ (*pos)++;
+ if (i->next == &modules)
+ return NULL;
+ return i->next;
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+ up(&module_mutex);
+}
+
+static int m_show(struct seq_file *m, void *p)
+{
+ struct module *mod = list_entry(p, struct module, list);
+ seq_printf(m, "%s %lu",
+ mod->name, mod->init_size + mod->core_size);
+ print_unload_info(m, mod);
+
+ /* Informative for users. */
+ seq_printf(m, " %s",
+ mod->state == MODULE_STATE_GOING ? "Unloading":
+ mod->state == MODULE_STATE_COMING ? "Loading":
+ "Live");
+ /* Used by oprofile and other similar tools. */
+ seq_printf(m, " 0x%p", mod->module_core);
+
+ seq_printf(m, "\n");
+ return 0;
+}
+
+/* Format: modulename size refcount deps address
+
+ Where refcount is a number or -, and deps is a comma-separated list
+ of depends or -.
+*/
+struct seq_operations modules_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = m_show
+};
+
+/* Given an address, look for it in the module exception tables. */
+const struct exception_table_entry *search_module_extables(unsigned long addr)
+{
+ unsigned long flags;
+ const struct exception_table_entry *e = NULL;
+ struct module *mod;
+
+ spin_lock_irqsave(&modlist_lock, flags);
+ list_for_each_entry(mod, &modules, list) {
+ if (mod->num_exentries == 0)
+ continue;
+
+ e = search_extable(mod->extable,
+ mod->extable + mod->num_exentries - 1,
+ addr);
+ if (e)
+ break;
+ }
+ spin_unlock_irqrestore(&modlist_lock, flags);
+
+ /* Now, if we found one, we are running inside it now, hence
+ we cannot unload the module, hence no refcnt needed. */
+ return e;
+}
+
+/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */
+struct module *__module_text_address(unsigned long addr)
+{
+ struct module *mod;
+
+ list_for_each_entry(mod, &modules, list)
+ if (within(addr, mod->module_init, mod->init_text_size)
+ || within(addr, mod->module_core, mod->core_text_size))
+ return mod;
+ return NULL;
+}
+
+struct module *module_text_address(unsigned long addr)
+{
+ struct module *mod;
+ unsigned long flags;
+
+ spin_lock_irqsave(&modlist_lock, flags);
+ mod = __module_text_address(addr);
+ spin_unlock_irqrestore(&modlist_lock, flags);
+
+ return mod;
+}
+
+/* Don't grab lock, we're oopsing. */
+void print_modules(void)
+{
+ struct module *mod;
+
+ printk("Modules linked in:");
+ list_for_each_entry(mod, &modules, list)
+ printk(" %s", mod->name);
+ printk("\n");
+}
+
+void module_add_driver(struct module *mod, struct device_driver *drv)
+{
+ if (!mod || !drv)
+ return;
+
+ /* Don't check return code; this call is idempotent */
+ sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
+}
+EXPORT_SYMBOL(module_add_driver);
+
+void module_remove_driver(struct device_driver *drv)
+{
+ if (!drv)
+ return;
+ sysfs_remove_link(&drv->kobj, "module");
+}
+EXPORT_SYMBOL(module_remove_driver);
+
+#ifdef CONFIG_MODVERSIONS
+/* Generate the signature for struct module here, too, for modversions. */
+void struct_module(struct module *mod) { return; }
+EXPORT_SYMBOL(struct_module);
+#endif
diff --git a/kernel/panic.c b/kernel/panic.c
new file mode 100644
index 000000000000..0fa3f3a66fb6
--- /dev/null
+++ b/kernel/panic.c
@@ -0,0 +1,157 @@
+/*
+ * linux/kernel/panic.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * This function is used through-out the kernel (including mm and fs)
+ * to indicate a major problem.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/sysrq.h>
+#include <linux/interrupt.h>
+#include <linux/nmi.h>
+
+int panic_timeout;
+int panic_on_oops;
+int tainted;
+
+EXPORT_SYMBOL(panic_timeout);
+
+struct notifier_block *panic_notifier_list;
+
+EXPORT_SYMBOL(panic_notifier_list);
+
+static int __init panic_setup(char *str)
+{
+ panic_timeout = simple_strtoul(str, NULL, 0);
+ return 1;
+}
+__setup("panic=", panic_setup);
+
+static long no_blink(long time)
+{
+ return 0;
+}
+
+/* Returns how long it waited in ms */
+long (*panic_blink)(long time);
+EXPORT_SYMBOL(panic_blink);
+
+/**
+ * panic - halt the system
+ * @fmt: The text string to print
+ *
+ * Display a message, then perform cleanups.
+ *
+ * This function never returns.
+ */
+
+NORET_TYPE void panic(const char * fmt, ...)
+{
+ long i;
+ static char buf[1024];
+ va_list args;
+#if defined(CONFIG_ARCH_S390)
+ unsigned long caller = (unsigned long) __builtin_return_address(0);
+#endif
+
+ bust_spinlocks(1);
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+ bust_spinlocks(0);
+
+#ifdef CONFIG_SMP
+ smp_send_stop();
+#endif
+
+ notifier_call_chain(&panic_notifier_list, 0, buf);
+
+ if (!panic_blink)
+ panic_blink = no_blink;
+
+ if (panic_timeout > 0)
+ {
+ /*
+ * Delay timeout seconds before rebooting the machine.
+ * We can't use the "normal" timers since we just panicked..
+ */
+ printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
+ for (i = 0; i < panic_timeout*1000; ) {
+ touch_nmi_watchdog();
+ i += panic_blink(i);
+ mdelay(1);
+ i++;
+ }
+ /*
+ * Should we run the reboot notifier. For the moment Im
+ * choosing not too. It might crash, be corrupt or do
+ * more harm than good for other reasons.
+ */
+ machine_restart(NULL);
+ }
+#ifdef __sparc__
+ {
+ extern int stop_a_enabled;
+ /* Make sure the user can actually press L1-A */
+ stop_a_enabled = 1;
+ printk(KERN_EMERG "Press L1-A to return to the boot prom\n");
+ }
+#endif
+#if defined(CONFIG_ARCH_S390)
+ disabled_wait(caller);
+#endif
+ local_irq_enable();
+ for (i = 0;;) {
+ i += panic_blink(i);
+ mdelay(1);
+ i++;
+ }
+}
+
+EXPORT_SYMBOL(panic);
+
+/**
+ * print_tainted - return a string to represent the kernel taint state.
+ *
+ * 'P' - Proprietary module has been loaded.
+ * 'F' - Module has been forcibly loaded.
+ * 'S' - SMP with CPUs not designed for SMP.
+ * 'R' - User forced a module unload.
+ * 'M' - Machine had a machine check experience.
+ * 'B' - System has hit bad_page.
+ *
+ * The string is overwritten by the next call to print_taint().
+ */
+
+const char *print_tainted(void)
+{
+ static char buf[20];
+ if (tainted) {
+ snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c",
+ tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
+ tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
+ tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
+ tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
+ tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
+ tainted & TAINT_BAD_PAGE ? 'B' : ' ');
+ }
+ else
+ snprintf(buf, sizeof(buf), "Not tainted");
+ return(buf);
+}
+
+void add_taint(unsigned flag)
+{
+ tainted |= flag;
+}
+EXPORT_SYMBOL(add_taint);
diff --git a/kernel/params.c b/kernel/params.c
new file mode 100644
index 000000000000..5538608bd339
--- /dev/null
+++ b/kernel/params.c
@@ -0,0 +1,721 @@
+/* Helpers for initial module or kernel cmdline parsing
+ Copyright (C) 2001 Rusty Russell.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <linux/config.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt, a...)
+#endif
+
+static inline int dash2underscore(char c)
+{
+ if (c == '-')
+ return '_';
+ return c;
+}
+
+static inline int parameq(const char *input, const char *paramname)
+{
+ unsigned int i;
+ for (i = 0; dash2underscore(input[i]) == paramname[i]; i++)
+ if (input[i] == '\0')
+ return 1;
+ return 0;
+}
+
+static int parse_one(char *param,
+ char *val,
+ struct kernel_param *params,
+ unsigned num_params,
+ int (*handle_unknown)(char *param, char *val))
+{
+ unsigned int i;
+
+ /* Find parameter */
+ for (i = 0; i < num_params; i++) {
+ if (parameq(param, params[i].name)) {
+ DEBUGP("They are equal! Calling %p\n",
+ params[i].set);
+ return params[i].set(val, &params[i]);
+ }
+ }
+
+ if (handle_unknown) {
+ DEBUGP("Unknown argument: calling %p\n", handle_unknown);
+ return handle_unknown(param, val);
+ }
+
+ DEBUGP("Unknown argument `%s'\n", param);
+ return -ENOENT;
+}
+
+/* You can use " around spaces, but can't escape ". */
+/* Hyphens and underscores equivalent in parameter names. */
+static char *next_arg(char *args, char **param, char **val)
+{
+ unsigned int i, equals = 0;
+ int in_quote = 0, quoted = 0;
+ char *next;
+
+ /* Chew any extra spaces */
+ while (*args == ' ') args++;
+ if (*args == '"') {
+ args++;
+ in_quote = 1;
+ quoted = 1;
+ }
+
+ for (i = 0; args[i]; i++) {
+ if (args[i] == ' ' && !in_quote)
+ break;
+ if (equals == 0) {
+ if (args[i] == '=')
+ equals = i;
+ }
+ if (args[i] == '"')
+ in_quote = !in_quote;
+ }
+
+ *param = args;
+ if (!equals)
+ *val = NULL;
+ else {
+ args[equals] = '\0';
+ *val = args + equals + 1;
+
+ /* Don't include quotes in value. */
+ if (**val == '"') {
+ (*val)++;
+ if (args[i-1] == '"')
+ args[i-1] = '\0';
+ }
+ if (quoted && args[i-1] == '"')
+ args[i-1] = '\0';
+ }
+
+ if (args[i]) {
+ args[i] = '\0';
+ next = args + i + 1;
+ } else
+ next = args + i;
+ return next;
+}
+
+/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
+int parse_args(const char *name,
+ char *args,
+ struct kernel_param *params,
+ unsigned num,
+ int (*unknown)(char *param, char *val))
+{
+ char *param, *val;
+
+ DEBUGP("Parsing ARGS: %s\n", args);
+
+ while (*args) {
+ int ret;
+
+ args = next_arg(args, &param, &val);
+ ret = parse_one(param, val, params, num, unknown);
+ switch (ret) {
+ case -ENOENT:
+ printk(KERN_ERR "%s: Unknown parameter `%s'\n",
+ name, param);
+ return ret;
+ case -ENOSPC:
+ printk(KERN_ERR
+ "%s: `%s' too large for parameter `%s'\n",
+ name, val ?: "", param);
+ return ret;
+ case 0:
+ break;
+ default:
+ printk(KERN_ERR
+ "%s: `%s' invalid for parameter `%s'\n",
+ name, val ?: "", param);
+ return ret;
+ }
+ }
+
+ /* All parsed OK. */
+ return 0;
+}
+
+/* Lazy bastard, eh? */
+#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
+ int param_set_##name(const char *val, struct kernel_param *kp) \
+ { \
+ char *endp; \
+ tmptype l; \
+ \
+ if (!val) return -EINVAL; \
+ l = strtolfn(val, &endp, 0); \
+ if (endp == val || ((type)l != l)) \
+ return -EINVAL; \
+ *((type *)kp->arg) = l; \
+ return 0; \
+ } \
+ int param_get_##name(char *buffer, struct kernel_param *kp) \
+ { \
+ return sprintf(buffer, format, *((type *)kp->arg)); \
+ }
+
+STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul);
+STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul);
+STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul);
+STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul);
+
+int param_set_charp(const char *val, struct kernel_param *kp)
+{
+ if (!val) {
+ printk(KERN_ERR "%s: string parameter expected\n",
+ kp->name);
+ return -EINVAL;
+ }
+
+ if (strlen(val) > 1024) {
+ printk(KERN_ERR "%s: string parameter too long\n",
+ kp->name);
+ return -ENOSPC;
+ }
+
+ *(char **)kp->arg = (char *)val;
+ return 0;
+}
+
+int param_get_charp(char *buffer, struct kernel_param *kp)
+{
+ return sprintf(buffer, "%s", *((char **)kp->arg));
+}
+
+int param_set_bool(const char *val, struct kernel_param *kp)
+{
+ /* No equals means "set"... */
+ if (!val) val = "1";
+
+ /* One of =[yYnN01] */
+ switch (val[0]) {
+ case 'y': case 'Y': case '1':
+ *(int *)kp->arg = 1;
+ return 0;
+ case 'n': case 'N': case '0':
+ *(int *)kp->arg = 0;
+ return 0;
+ }
+ return -EINVAL;
+}
+
+int param_get_bool(char *buffer, struct kernel_param *kp)
+{
+ /* Y and N chosen as being relatively non-coder friendly */
+ return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N');
+}
+
+int param_set_invbool(const char *val, struct kernel_param *kp)
+{
+ int boolval, ret;
+ struct kernel_param dummy = { .arg = &boolval };
+
+ ret = param_set_bool(val, &dummy);
+ if (ret == 0)
+ *(int *)kp->arg = !boolval;
+ return ret;
+}
+
+int param_get_invbool(char *buffer, struct kernel_param *kp)
+{
+ int val;
+ struct kernel_param dummy = { .arg = &val };
+
+ val = !*(int *)kp->arg;
+ return param_get_bool(buffer, &dummy);
+}
+
+/* We cheat here and temporarily mangle the string. */
+int param_array(const char *name,
+ const char *val,
+ unsigned int min, unsigned int max,
+ void *elem, int elemsize,
+ int (*set)(const char *, struct kernel_param *kp),
+ int *num)
+{
+ int ret;
+ struct kernel_param kp;
+ char save;
+
+ /* Get the name right for errors. */
+ kp.name = name;
+ kp.arg = elem;
+
+ /* No equals sign? */
+ if (!val) {
+ printk(KERN_ERR "%s: expects arguments\n", name);
+ return -EINVAL;
+ }
+
+ *num = 0;
+ /* We expect a comma-separated list of values. */
+ do {
+ int len;
+
+ if (*num == max) {
+ printk(KERN_ERR "%s: can only take %i arguments\n",
+ name, max);
+ return -EINVAL;
+ }
+ len = strcspn(val, ",");
+
+ /* nul-terminate and parse */
+ save = val[len];
+ ((char *)val)[len] = '\0';
+ ret = set(val, &kp);
+
+ if (ret != 0)
+ return ret;
+ kp.arg += elemsize;
+ val += len+1;
+ (*num)++;
+ } while (save == ',');
+
+ if (*num < min) {
+ printk(KERN_ERR "%s: needs at least %i arguments\n",
+ name, min);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int param_array_set(const char *val, struct kernel_param *kp)
+{
+ struct kparam_array *arr = kp->arg;
+
+ return param_array(kp->name, val, 1, arr->max, arr->elem,
+ arr->elemsize, arr->set, arr->num ?: &arr->max);
+}
+
+int param_array_get(char *buffer, struct kernel_param *kp)
+{
+ int i, off, ret;
+ struct kparam_array *arr = kp->arg;
+ struct kernel_param p;
+
+ p = *kp;
+ for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+ if (i)
+ buffer[off++] = ',';
+ p.arg = arr->elem + arr->elemsize * i;
+ ret = arr->get(buffer + off, &p);
+ if (ret < 0)
+ return ret;
+ off += ret;
+ }
+ buffer[off] = '\0';
+ return off;
+}
+
+int param_set_copystring(const char *val, struct kernel_param *kp)
+{
+ struct kparam_string *kps = kp->arg;
+
+ if (strlen(val)+1 > kps->maxlen) {
+ printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
+ kp->name, kps->maxlen-1);
+ return -ENOSPC;
+ }
+ strcpy(kps->string, val);
+ return 0;
+}
+
+int param_get_string(char *buffer, struct kernel_param *kp)
+{
+ struct kparam_string *kps = kp->arg;
+ return strlcpy(buffer, kps->string, kps->maxlen);
+}
+
+/* sysfs output in /sys/modules/XYZ/parameters/ */
+
+extern struct kernel_param __start___param[], __stop___param[];
+
+#define MAX_KBUILD_MODNAME KOBJ_NAME_LEN
+
+struct param_attribute
+{
+ struct module_attribute mattr;
+ struct kernel_param *param;
+};
+
+struct module_param_attrs
+{
+ struct attribute_group grp;
+ struct param_attribute attrs[0];
+};
+
+#define to_param_attr(n) container_of(n, struct param_attribute, mattr);
+
+static ssize_t param_attr_show(struct module_attribute *mattr,
+ struct module *mod, char *buf)
+{
+ int count;
+ struct param_attribute *attribute = to_param_attr(mattr);
+
+ if (!attribute->param->get)
+ return -EPERM;
+
+ count = attribute->param->get(buf, attribute->param);
+ if (count > 0) {
+ strcat(buf, "\n");
+ ++count;
+ }
+ return count;
+}
+
+/* sysfs always hands a nul-terminated string in buf. We rely on that. */
+static ssize_t param_attr_store(struct module_attribute *mattr,
+ struct module *owner,
+ const char *buf, size_t len)
+{
+ int err;
+ struct param_attribute *attribute = to_param_attr(mattr);
+
+ if (!attribute->param->set)
+ return -EPERM;
+
+ err = attribute->param->set(buf, attribute->param);
+ if (!err)
+ return len;
+ return err;
+}
+
+#ifdef CONFIG_MODULES
+#define __modinit
+#else
+#define __modinit __init
+#endif
+
+/*
+ * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME
+ * @mk: struct module_kobject (contains parent kobject)
+ * @kparam: array of struct kernel_param, the actual parameter definitions
+ * @num_params: number of entries in array
+ * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules"
+ *
+ * Create a kobject for a (per-module) group of parameters, and create files
+ * in sysfs. A pointer to the param_kobject is returned on success,
+ * NULL if there's no parameter to export, or other ERR_PTR(err).
+ */
+static __modinit struct module_param_attrs *
+param_sysfs_setup(struct module_kobject *mk,
+ struct kernel_param *kparam,
+ unsigned int num_params,
+ unsigned int name_skip)
+{
+ struct module_param_attrs *mp;
+ unsigned int valid_attrs = 0;
+ unsigned int i, size[2];
+ struct param_attribute *pattr;
+ struct attribute **gattr;
+ int err;
+
+ for (i=0; i<num_params; i++) {
+ if (kparam[i].perm)
+ valid_attrs++;
+ }
+
+ if (!valid_attrs)
+ return NULL;
+
+ size[0] = ALIGN(sizeof(*mp) +
+ valid_attrs * sizeof(mp->attrs[0]),
+ sizeof(mp->grp.attrs[0]));
+ size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
+
+ mp = kmalloc(size[0] + size[1], GFP_KERNEL);
+ if (!mp)
+ return ERR_PTR(-ENOMEM);
+
+ mp->grp.name = "parameters";
+ mp->grp.attrs = (void *)mp + size[0];
+
+ pattr = &mp->attrs[0];
+ gattr = &mp->grp.attrs[0];
+ for (i = 0; i < num_params; i++) {
+ struct kernel_param *kp = &kparam[i];
+ if (kp->perm) {
+ pattr->param = kp;
+ pattr->mattr.show = param_attr_show;
+ pattr->mattr.store = param_attr_store;
+ pattr->mattr.attr.name = (char *)&kp->name[name_skip];
+ pattr->mattr.attr.owner = mk->mod;
+ pattr->mattr.attr.mode = kp->perm;
+ *(gattr++) = &(pattr++)->mattr.attr;
+ }
+ }
+ *gattr = NULL;
+
+ if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) {
+ kfree(mp);
+ return ERR_PTR(err);
+ }
+ return mp;
+}
+
+
+#ifdef CONFIG_MODULES
+
+/*
+ * module_param_sysfs_setup - setup sysfs support for one module
+ * @mod: module
+ * @kparam: module parameters (array)
+ * @num_params: number of module parameters
+ *
+ * Adds sysfs entries for module parameters, and creates a link from
+ * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/
+ */
+int module_param_sysfs_setup(struct module *mod,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ struct module_param_attrs *mp;
+
+ mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0);
+ if (IS_ERR(mp))
+ return PTR_ERR(mp);
+
+ mod->param_attrs = mp;
+ return 0;
+}
+
+/*
+ * module_param_sysfs_remove - remove sysfs support for one module
+ * @mod: module
+ *
+ * Remove sysfs entries for module parameters and the corresponding
+ * kobject.
+ */
+void module_param_sysfs_remove(struct module *mod)
+{
+ if (mod->param_attrs) {
+ sysfs_remove_group(&mod->mkobj.kobj,
+ &mod->param_attrs->grp);
+ /* We are positive that no one is using any param
+ * attrs at this point. Deallocate immediately. */
+ kfree(mod->param_attrs);
+ mod->param_attrs = NULL;
+ }
+}
+#endif
+
+/*
+ * kernel_param_sysfs_setup - wrapper for built-in params support
+ */
+static void __init kernel_param_sysfs_setup(const char *name,
+ struct kernel_param *kparam,
+ unsigned int num_params,
+ unsigned int name_skip)
+{
+ struct module_kobject *mk;
+
+ mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL);
+ memset(mk, 0, sizeof(struct module_kobject));
+
+ mk->mod = THIS_MODULE;
+ kobj_set_kset_s(mk, module_subsys);
+ kobject_set_name(&mk->kobj, name);
+ kobject_register(&mk->kobj);
+
+ /* no need to keep the kobject if no parameter is exported */
+ if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
+ kobject_unregister(&mk->kobj);
+ kfree(mk);
+ }
+}
+
+/*
+ * param_sysfs_builtin - add contents in /sys/parameters for built-in modules
+ *
+ * Add module_parameters to sysfs for "modules" built into the kernel.
+ *
+ * The "module" name (KBUILD_MODNAME) is stored before a dot, the
+ * "parameter" name is stored behind a dot in kernel_param->name. So,
+ * extract the "module" name for all built-in kernel_param-eters,
+ * and for all who have the same, call kernel_param_sysfs_setup.
+ */
+static void __init param_sysfs_builtin(void)
+{
+ struct kernel_param *kp, *kp_begin = NULL;
+ unsigned int i, name_len, count = 0;
+ char modname[MAX_KBUILD_MODNAME + 1] = "";
+
+ for (i=0; i < __stop___param - __start___param; i++) {
+ char *dot;
+
+ kp = &__start___param[i];
+
+ /* We do not handle args without periods. */
+ dot = memchr(kp->name, '.', MAX_KBUILD_MODNAME);
+ if (!dot) {
+ DEBUGP("couldn't find period in %s\n", kp->name);
+ continue;
+ }
+ name_len = dot - kp->name;
+
+ /* new kbuild_modname? */
+ if (strlen(modname) != name_len
+ || strncmp(modname, kp->name, name_len) != 0) {
+ /* add a new kobject for previous kernel_params. */
+ if (count)
+ kernel_param_sysfs_setup(modname,
+ kp_begin,
+ count,
+ strlen(modname)+1);
+
+ strncpy(modname, kp->name, name_len);
+ modname[name_len] = '\0';
+ count = 0;
+ kp_begin = kp;
+ }
+ count++;
+ }
+
+ /* last kernel_params need to be registered as well */
+ if (count)
+ kernel_param_sysfs_setup(modname, kp_begin, count,
+ strlen(modname)+1);
+}
+
+
+/* module-related sysfs stuff */
+#ifdef CONFIG_MODULES
+
+#define to_module_attr(n) container_of(n, struct module_attribute, attr);
+#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
+
+static ssize_t module_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct module_attribute *attribute;
+ struct module_kobject *mk;
+ int ret;
+
+ attribute = to_module_attr(attr);
+ mk = to_module_kobject(kobj);
+
+ if (!attribute->show)
+ return -EPERM;
+
+ if (!try_module_get(mk->mod))
+ return -ENODEV;
+
+ ret = attribute->show(attribute, mk->mod, buf);
+
+ module_put(mk->mod);
+
+ return ret;
+}
+
+static ssize_t module_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct module_attribute *attribute;
+ struct module_kobject *mk;
+ int ret;
+
+ attribute = to_module_attr(attr);
+ mk = to_module_kobject(kobj);
+
+ if (!attribute->store)
+ return -EPERM;
+
+ if (!try_module_get(mk->mod))
+ return -ENODEV;
+
+ ret = attribute->store(attribute, mk->mod, buf, len);
+
+ module_put(mk->mod);
+
+ return ret;
+}
+
+static struct sysfs_ops module_sysfs_ops = {
+ .show = module_attr_show,
+ .store = module_attr_store,
+};
+
+#else
+static struct sysfs_ops module_sysfs_ops = {
+ .show = NULL,
+ .store = NULL,
+};
+#endif
+
+static struct kobj_type module_ktype = {
+ .sysfs_ops = &module_sysfs_ops,
+};
+
+decl_subsys(module, &module_ktype, NULL);
+
+/*
+ * param_sysfs_init - wrapper for built-in params support
+ */
+static int __init param_sysfs_init(void)
+{
+ subsystem_register(&module_subsys);
+
+ param_sysfs_builtin();
+
+ return 0;
+}
+__initcall(param_sysfs_init);
+
+EXPORT_SYMBOL(param_set_byte);
+EXPORT_SYMBOL(param_get_byte);
+EXPORT_SYMBOL(param_set_short);
+EXPORT_SYMBOL(param_get_short);
+EXPORT_SYMBOL(param_set_ushort);
+EXPORT_SYMBOL(param_get_ushort);
+EXPORT_SYMBOL(param_set_int);
+EXPORT_SYMBOL(param_get_int);
+EXPORT_SYMBOL(param_set_uint);
+EXPORT_SYMBOL(param_get_uint);
+EXPORT_SYMBOL(param_set_long);
+EXPORT_SYMBOL(param_get_long);
+EXPORT_SYMBOL(param_set_ulong);
+EXPORT_SYMBOL(param_get_ulong);
+EXPORT_SYMBOL(param_set_charp);
+EXPORT_SYMBOL(param_get_charp);
+EXPORT_SYMBOL(param_set_bool);
+EXPORT_SYMBOL(param_get_bool);
+EXPORT_SYMBOL(param_set_invbool);
+EXPORT_SYMBOL(param_get_invbool);
+EXPORT_SYMBOL(param_array_set);
+EXPORT_SYMBOL(param_array_get);
+EXPORT_SYMBOL(param_set_copystring);
+EXPORT_SYMBOL(param_get_string);
diff --git a/kernel/pid.c b/kernel/pid.c
new file mode 100644
index 000000000000..edba31c681ac
--- /dev/null
+++ b/kernel/pid.c
@@ -0,0 +1,292 @@
+/*
+ * Generic pidhash and scalable, time-bounded PID allocator
+ *
+ * (C) 2002-2003 William Irwin, IBM
+ * (C) 2004 William Irwin, Oracle
+ * (C) 2002-2004 Ingo Molnar, Red Hat
+ *
+ * pid-structures are backing objects for tasks sharing a given ID to chain
+ * against. There is very little to them aside from hashing them and
+ * parking tasks using given ID's on a list.
+ *
+ * The hash is always changed with the tasklist_lock write-acquired,
+ * and the hash is only accessed with the tasklist_lock at least
+ * read-acquired, so there's no additional SMP locking needed here.
+ *
+ * We have a list of bitmap pages, which bitmaps represent the PID space.
+ * Allocating and freeing PIDs is completely lockless. The worst-case
+ * allocation scenario when all but one out of 1 million PIDs possible are
+ * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
+ * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/hash.h>
+
+#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
+static struct hlist_head *pid_hash[PIDTYPE_MAX];
+static int pidhash_shift;
+
+int pid_max = PID_MAX_DEFAULT;
+int last_pid;
+
+#define RESERVED_PIDS 300
+
+int pid_max_min = RESERVED_PIDS + 1;
+int pid_max_max = PID_MAX_LIMIT;
+
+#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8)
+#define BITS_PER_PAGE (PAGE_SIZE*8)
+#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
+#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off))
+#define find_next_offset(map, off) \
+ find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
+
+/*
+ * PID-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way a low pid_max
+ * value does not cause lots of bitmaps to be allocated, but
+ * the scheme scales to up to 4 million PIDs, runtime.
+ */
+typedef struct pidmap {
+ atomic_t nr_free;
+ void *page;
+} pidmap_t;
+
+static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
+ { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+
+fastcall void free_pidmap(int pid)
+{
+ pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
+ int offset = pid & BITS_PER_PAGE_MASK;
+
+ clear_bit(offset, map->page);
+ atomic_inc(&map->nr_free);
+}
+
+int alloc_pidmap(void)
+{
+ int i, offset, max_scan, pid, last = last_pid;
+ pidmap_t *map;
+
+ pid = last + 1;
+ if (pid >= pid_max)
+ pid = RESERVED_PIDS;
+ offset = pid & BITS_PER_PAGE_MASK;
+ map = &pidmap_array[pid/BITS_PER_PAGE];
+ max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
+ for (i = 0; i <= max_scan; ++i) {
+ if (unlikely(!map->page)) {
+ unsigned long page = get_zeroed_page(GFP_KERNEL);
+ /*
+ * Free the page if someone raced with us
+ * installing it:
+ */
+ spin_lock(&pidmap_lock);
+ if (map->page)
+ free_page(page);
+ else
+ map->page = (void *)page;
+ spin_unlock(&pidmap_lock);
+ if (unlikely(!map->page))
+ break;
+ }
+ if (likely(atomic_read(&map->nr_free))) {
+ do {
+ if (!test_and_set_bit(offset, map->page)) {
+ atomic_dec(&map->nr_free);
+ last_pid = pid;
+ return pid;
+ }
+ offset = find_next_offset(map, offset);
+ pid = mk_pid(map, offset);
+ /*
+ * find_next_offset() found a bit, the pid from it
+ * is in-bounds, and if we fell back to the last
+ * bitmap block and the final block was the same
+ * as the starting point, pid is before last_pid.
+ */
+ } while (offset < BITS_PER_PAGE && pid < pid_max &&
+ (i != max_scan || pid < last ||
+ !((last+1) & BITS_PER_PAGE_MASK)));
+ }
+ if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) {
+ ++map;
+ offset = 0;
+ } else {
+ map = &pidmap_array[0];
+ offset = RESERVED_PIDS;
+ if (unlikely(last == offset))
+ break;
+ }
+ pid = mk_pid(map, offset);
+ }
+ return -1;
+}
+
+struct pid * fastcall find_pid(enum pid_type type, int nr)
+{
+ struct hlist_node *elem;
+ struct pid *pid;
+
+ hlist_for_each_entry(pid, elem,
+ &pid_hash[type][pid_hashfn(nr)], pid_chain) {
+ if (pid->nr == nr)
+ return pid;
+ }
+ return NULL;
+}
+
+int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
+{
+ struct pid *pid, *task_pid;
+
+ task_pid = &task->pids[type];
+ pid = find_pid(type, nr);
+ if (pid == NULL) {
+ hlist_add_head(&task_pid->pid_chain,
+ &pid_hash[type][pid_hashfn(nr)]);
+ INIT_LIST_HEAD(&task_pid->pid_list);
+ } else {
+ INIT_HLIST_NODE(&task_pid->pid_chain);
+ list_add_tail(&task_pid->pid_list, &pid->pid_list);
+ }
+ task_pid->nr = nr;
+
+ return 0;
+}
+
+static fastcall int __detach_pid(task_t *task, enum pid_type type)
+{
+ struct pid *pid, *pid_next;
+ int nr = 0;
+
+ pid = &task->pids[type];
+ if (!hlist_unhashed(&pid->pid_chain)) {
+ hlist_del(&pid->pid_chain);
+
+ if (list_empty(&pid->pid_list))
+ nr = pid->nr;
+ else {
+ pid_next = list_entry(pid->pid_list.next,
+ struct pid, pid_list);
+ /* insert next pid from pid_list to hash */
+ hlist_add_head(&pid_next->pid_chain,
+ &pid_hash[type][pid_hashfn(pid_next->nr)]);
+ }
+ }
+
+ list_del(&pid->pid_list);
+ pid->nr = 0;
+
+ return nr;
+}
+
+void fastcall detach_pid(task_t *task, enum pid_type type)
+{
+ int tmp, nr;
+
+ nr = __detach_pid(task, type);
+ if (!nr)
+ return;
+
+ for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+ if (tmp != type && find_pid(tmp, nr))
+ return;
+
+ free_pidmap(nr);
+}
+
+task_t *find_task_by_pid_type(int type, int nr)
+{
+ struct pid *pid;
+
+ pid = find_pid(type, nr);
+ if (!pid)
+ return NULL;
+
+ return pid_task(&pid->pid_list, type);
+}
+
+EXPORT_SYMBOL(find_task_by_pid_type);
+
+/*
+ * This function switches the PIDs if a non-leader thread calls
+ * sys_execve() - this must be done without releasing the PID.
+ * (which a detach_pid() would eventually do.)
+ */
+void switch_exec_pids(task_t *leader, task_t *thread)
+{
+ __detach_pid(leader, PIDTYPE_PID);
+ __detach_pid(leader, PIDTYPE_TGID);
+ __detach_pid(leader, PIDTYPE_PGID);
+ __detach_pid(leader, PIDTYPE_SID);
+
+ __detach_pid(thread, PIDTYPE_PID);
+ __detach_pid(thread, PIDTYPE_TGID);
+
+ leader->pid = leader->tgid = thread->pid;
+ thread->pid = thread->tgid;
+
+ attach_pid(thread, PIDTYPE_PID, thread->pid);
+ attach_pid(thread, PIDTYPE_TGID, thread->tgid);
+ attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
+ attach_pid(thread, PIDTYPE_SID, thread->signal->session);
+ list_add_tail(&thread->tasks, &init_task.tasks);
+
+ attach_pid(leader, PIDTYPE_PID, leader->pid);
+ attach_pid(leader, PIDTYPE_TGID, leader->tgid);
+ attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
+ attach_pid(leader, PIDTYPE_SID, leader->signal->session);
+}
+
+/*
+ * The pid hash table is scaled according to the amount of memory in the
+ * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
+ * more.
+ */
+void __init pidhash_init(void)
+{
+ int i, j, pidhash_size;
+ unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
+
+ pidhash_shift = max(4, fls(megabytes * 4));
+ pidhash_shift = min(12, pidhash_shift);
+ pidhash_size = 1 << pidhash_shift;
+
+ printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
+ pidhash_size, pidhash_shift,
+ PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head));
+
+ for (i = 0; i < PIDTYPE_MAX; i++) {
+ pid_hash[i] = alloc_bootmem(pidhash_size *
+ sizeof(*(pid_hash[i])));
+ if (!pid_hash[i])
+ panic("Could not alloc pidhash!\n");
+ for (j = 0; j < pidhash_size; j++)
+ INIT_HLIST_HEAD(&pid_hash[i][j]);
+ }
+}
+
+void __init pidmap_init(void)
+{
+ int i;
+
+ pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
+ set_bit(0, pidmap_array->page);
+ atomic_dec(&pidmap_array->nr_free);
+
+ /*
+ * Allocate PID 0, and hash it via all PID types:
+ */
+
+ for (i = 0; i < PIDTYPE_MAX; i++)
+ attach_pid(current, i, 0);
+}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
new file mode 100644
index 000000000000..ad85d3f0dcc4
--- /dev/null
+++ b/kernel/posix-cpu-timers.c
@@ -0,0 +1,1559 @@
+/*
+ * Implement CPU time clocks for the POSIX clock interface.
+ */
+
+#include <linux/sched.h>
+#include <linux/posix-timers.h>
+#include <asm/uaccess.h>
+#include <linux/errno.h>
+
+static int check_clock(clockid_t which_clock)
+{
+ int error = 0;
+ struct task_struct *p;
+ const pid_t pid = CPUCLOCK_PID(which_clock);
+
+ if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
+ return -EINVAL;
+
+ if (pid == 0)
+ return 0;
+
+ read_lock(&tasklist_lock);
+ p = find_task_by_pid(pid);
+ if (!p || (CPUCLOCK_PERTHREAD(which_clock) ?
+ p->tgid != current->tgid : p->tgid != pid)) {
+ error = -EINVAL;
+ }
+ read_unlock(&tasklist_lock);
+
+ return error;
+}
+
+static inline union cpu_time_count
+timespec_to_sample(clockid_t which_clock, const struct timespec *tp)
+{
+ union cpu_time_count ret;
+ ret.sched = 0; /* high half always zero when .cpu used */
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
+ } else {
+ ret.cpu = timespec_to_cputime(tp);
+ }
+ return ret;
+}
+
+static void sample_to_timespec(clockid_t which_clock,
+ union cpu_time_count cpu,
+ struct timespec *tp)
+{
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ tp->tv_sec = div_long_long_rem(cpu.sched,
+ NSEC_PER_SEC, &tp->tv_nsec);
+ } else {
+ cputime_to_timespec(cpu.cpu, tp);
+ }
+}
+
+static inline int cpu_time_before(clockid_t which_clock,
+ union cpu_time_count now,
+ union cpu_time_count then)
+{
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ return now.sched < then.sched;
+ } else {
+ return cputime_lt(now.cpu, then.cpu);
+ }
+}
+static inline void cpu_time_add(clockid_t which_clock,
+ union cpu_time_count *acc,
+ union cpu_time_count val)
+{
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ acc->sched += val.sched;
+ } else {
+ acc->cpu = cputime_add(acc->cpu, val.cpu);
+ }
+}
+static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
+ union cpu_time_count a,
+ union cpu_time_count b)
+{
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ a.sched -= b.sched;
+ } else {
+ a.cpu = cputime_sub(a.cpu, b.cpu);
+ }
+ return a;
+}
+
+/*
+ * Update expiry time from increment, and increase overrun count,
+ * given the current clock sample.
+ */
+static inline void bump_cpu_timer(struct k_itimer *timer,
+ union cpu_time_count now)
+{
+ int i;
+
+ if (timer->it.cpu.incr.sched == 0)
+ return;
+
+ if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
+ unsigned long long delta, incr;
+
+ if (now.sched < timer->it.cpu.expires.sched)
+ return;
+ incr = timer->it.cpu.incr.sched;
+ delta = now.sched + incr - timer->it.cpu.expires.sched;
+ /* Don't use (incr*2 < delta), incr*2 might overflow. */
+ for (i = 0; incr < delta - incr; i++)
+ incr = incr << 1;
+ for (; i >= 0; incr >>= 1, i--) {
+ if (delta <= incr)
+ continue;
+ timer->it.cpu.expires.sched += incr;
+ timer->it_overrun += 1 << i;
+ delta -= incr;
+ }
+ } else {
+ cputime_t delta, incr;
+
+ if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu))
+ return;
+ incr = timer->it.cpu.incr.cpu;
+ delta = cputime_sub(cputime_add(now.cpu, incr),
+ timer->it.cpu.expires.cpu);
+ /* Don't use (incr*2 < delta), incr*2 might overflow. */
+ for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
+ incr = cputime_add(incr, incr);
+ for (; i >= 0; incr = cputime_halve(incr), i--) {
+ if (cputime_le(delta, incr))
+ continue;
+ timer->it.cpu.expires.cpu =
+ cputime_add(timer->it.cpu.expires.cpu, incr);
+ timer->it_overrun += 1 << i;
+ delta = cputime_sub(delta, incr);
+ }
+ }
+}
+
+static inline cputime_t prof_ticks(struct task_struct *p)
+{
+ return cputime_add(p->utime, p->stime);
+}
+static inline cputime_t virt_ticks(struct task_struct *p)
+{
+ return p->utime;
+}
+static inline unsigned long long sched_ns(struct task_struct *p)
+{
+ return (p == current) ? current_sched_time(p) : p->sched_time;
+}
+
+int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+{
+ int error = check_clock(which_clock);
+ if (!error) {
+ tp->tv_sec = 0;
+ tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ /*
+ * If sched_clock is using a cycle counter, we
+ * don't have any idea of its true resolution
+ * exported, but it is much more than 1s/HZ.
+ */
+ tp->tv_nsec = 1;
+ }
+ }
+ return error;
+}
+
+int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp)
+{
+ /*
+ * You can never reset a CPU clock, but we check for other errors
+ * in the call before failing with EPERM.
+ */
+ int error = check_clock(which_clock);
+ if (error == 0) {
+ error = -EPERM;
+ }
+ return error;
+}
+
+
+/*
+ * Sample a per-thread clock for the given task.
+ */
+static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p,
+ union cpu_time_count *cpu)
+{
+ switch (CPUCLOCK_WHICH(which_clock)) {
+ default:
+ return -EINVAL;
+ case CPUCLOCK_PROF:
+ cpu->cpu = prof_ticks(p);
+ break;
+ case CPUCLOCK_VIRT:
+ cpu->cpu = virt_ticks(p);
+ break;
+ case CPUCLOCK_SCHED:
+ cpu->sched = sched_ns(p);
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Sample a process (thread group) clock for the given group_leader task.
+ * Must be called with tasklist_lock held for reading.
+ * Must be called with tasklist_lock held for reading, and p->sighand->siglock.
+ */
+static int cpu_clock_sample_group_locked(unsigned int clock_idx,
+ struct task_struct *p,
+ union cpu_time_count *cpu)
+{
+ struct task_struct *t = p;
+ switch (clock_idx) {
+ default:
+ return -EINVAL;
+ case CPUCLOCK_PROF:
+ cpu->cpu = cputime_add(p->signal->utime, p->signal->stime);
+ do {
+ cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t));
+ t = next_thread(t);
+ } while (t != p);
+ break;
+ case CPUCLOCK_VIRT:
+ cpu->cpu = p->signal->utime;
+ do {
+ cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t));
+ t = next_thread(t);
+ } while (t != p);
+ break;
+ case CPUCLOCK_SCHED:
+ cpu->sched = p->signal->sched_time;
+ /* Add in each other live thread. */
+ while ((t = next_thread(t)) != p) {
+ cpu->sched += t->sched_time;
+ }
+ if (p->tgid == current->tgid) {
+ /*
+ * We're sampling ourselves, so include the
+ * cycles not yet banked. We still omit
+ * other threads running on other CPUs,
+ * so the total can always be behind as
+ * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ).
+ */
+ cpu->sched += current_sched_time(current);
+ } else {
+ cpu->sched += p->sched_time;
+ }
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Sample a process (thread group) clock for the given group_leader task.
+ * Must be called with tasklist_lock held for reading.
+ */
+static int cpu_clock_sample_group(clockid_t which_clock,
+ struct task_struct *p,
+ union cpu_time_count *cpu)
+{
+ int ret;
+ unsigned long flags;
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p,
+ cpu);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ return ret;
+}
+
+
+int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+ const pid_t pid = CPUCLOCK_PID(which_clock);
+ int error = -EINVAL;
+ union cpu_time_count rtn;
+
+ if (pid == 0) {
+ /*
+ * Special case constant value for our own clocks.
+ * We don't have to do any lookup to find ourselves.
+ */
+ if (CPUCLOCK_PERTHREAD(which_clock)) {
+ /*
+ * Sampling just ourselves we can do with no locking.
+ */
+ error = cpu_clock_sample(which_clock,
+ current, &rtn);
+ } else {
+ read_lock(&tasklist_lock);
+ error = cpu_clock_sample_group(which_clock,
+ current, &rtn);
+ read_unlock(&tasklist_lock);
+ }
+ } else {
+ /*
+ * Find the given PID, and validate that the caller
+ * should be able to see it.
+ */
+ struct task_struct *p;
+ read_lock(&tasklist_lock);
+ p = find_task_by_pid(pid);
+ if (p) {
+ if (CPUCLOCK_PERTHREAD(which_clock)) {
+ if (p->tgid == current->tgid) {
+ error = cpu_clock_sample(which_clock,
+ p, &rtn);
+ }
+ } else if (p->tgid == pid && p->signal) {
+ error = cpu_clock_sample_group(which_clock,
+ p, &rtn);
+ }
+ }
+ read_unlock(&tasklist_lock);
+ }
+
+ if (error)
+ return error;
+ sample_to_timespec(which_clock, rtn, tp);
+ return 0;
+}
+
+
+/*
+ * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
+ * This is called from sys_timer_create with the new timer already locked.
+ */
+int posix_cpu_timer_create(struct k_itimer *new_timer)
+{
+ int ret = 0;
+ const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
+ struct task_struct *p;
+
+ if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&new_timer->it.cpu.entry);
+ new_timer->it.cpu.incr.sched = 0;
+ new_timer->it.cpu.expires.sched = 0;
+
+ read_lock(&tasklist_lock);
+ if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
+ if (pid == 0) {
+ p = current;
+ } else {
+ p = find_task_by_pid(pid);
+ if (p && p->tgid != current->tgid)
+ p = NULL;
+ }
+ } else {
+ if (pid == 0) {
+ p = current->group_leader;
+ } else {
+ p = find_task_by_pid(pid);
+ if (p && p->tgid != pid)
+ p = NULL;
+ }
+ }
+ new_timer->it.cpu.task = p;
+ if (p) {
+ get_task_struct(p);
+ } else {
+ ret = -EINVAL;
+ }
+ read_unlock(&tasklist_lock);
+
+ return ret;
+}
+
+/*
+ * Clean up a CPU-clock timer that is about to be destroyed.
+ * This is called from timer deletion with the timer already locked.
+ * If we return TIMER_RETRY, it's necessary to release the timer's lock
+ * and try again. (This happens when the timer is in the middle of firing.)
+ */
+int posix_cpu_timer_del(struct k_itimer *timer)
+{
+ struct task_struct *p = timer->it.cpu.task;
+
+ if (timer->it.cpu.firing)
+ return TIMER_RETRY;
+
+ if (unlikely(p == NULL))
+ return 0;
+
+ if (!list_empty(&timer->it.cpu.entry)) {
+ read_lock(&tasklist_lock);
+ if (unlikely(p->signal == NULL)) {
+ /*
+ * We raced with the reaping of the task.
+ * The deletion should have cleared us off the list.
+ */
+ BUG_ON(!list_empty(&timer->it.cpu.entry));
+ } else {
+ /*
+ * Take us off the task's timer list.
+ */
+ spin_lock(&p->sighand->siglock);
+ list_del(&timer->it.cpu.entry);
+ spin_unlock(&p->sighand->siglock);
+ }
+ read_unlock(&tasklist_lock);
+ }
+ put_task_struct(p);
+
+ return 0;
+}
+
+/*
+ * Clean out CPU timers still ticking when a thread exited. The task
+ * pointer is cleared, and the expiry time is replaced with the residual
+ * time for later timer_gettime calls to return.
+ * This must be called with the siglock held.
+ */
+static void cleanup_timers(struct list_head *head,
+ cputime_t utime, cputime_t stime,
+ unsigned long long sched_time)
+{
+ struct cpu_timer_list *timer, *next;
+ cputime_t ptime = cputime_add(utime, stime);
+
+ list_for_each_entry_safe(timer, next, head, entry) {
+ timer->task = NULL;
+ list_del_init(&timer->entry);
+ if (cputime_lt(timer->expires.cpu, ptime)) {
+ timer->expires.cpu = cputime_zero;
+ } else {
+ timer->expires.cpu = cputime_sub(timer->expires.cpu,
+ ptime);
+ }
+ }
+
+ ++head;
+ list_for_each_entry_safe(timer, next, head, entry) {
+ timer->task = NULL;
+ list_del_init(&timer->entry);
+ if (cputime_lt(timer->expires.cpu, utime)) {
+ timer->expires.cpu = cputime_zero;
+ } else {
+ timer->expires.cpu = cputime_sub(timer->expires.cpu,
+ utime);
+ }
+ }
+
+ ++head;
+ list_for_each_entry_safe(timer, next, head, entry) {
+ timer->task = NULL;
+ list_del_init(&timer->entry);
+ if (timer->expires.sched < sched_time) {
+ timer->expires.sched = 0;
+ } else {
+ timer->expires.sched -= sched_time;
+ }
+ }
+}
+
+/*
+ * These are both called with the siglock held, when the current thread
+ * is being reaped. When the final (leader) thread in the group is reaped,
+ * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
+ */
+void posix_cpu_timers_exit(struct task_struct *tsk)
+{
+ cleanup_timers(tsk->cpu_timers,
+ tsk->utime, tsk->stime, tsk->sched_time);
+
+}
+void posix_cpu_timers_exit_group(struct task_struct *tsk)
+{
+ cleanup_timers(tsk->signal->cpu_timers,
+ cputime_add(tsk->utime, tsk->signal->utime),
+ cputime_add(tsk->stime, tsk->signal->stime),
+ tsk->sched_time + tsk->signal->sched_time);
+}
+
+
+/*
+ * Set the expiry times of all the threads in the process so one of them
+ * will go off before the process cumulative expiry total is reached.
+ */
+static void process_timer_rebalance(struct task_struct *p,
+ unsigned int clock_idx,
+ union cpu_time_count expires,
+ union cpu_time_count val)
+{
+ cputime_t ticks, left;
+ unsigned long long ns, nsleft;
+ struct task_struct *t = p;
+ unsigned int nthreads = atomic_read(&p->signal->live);
+
+ switch (clock_idx) {
+ default:
+ BUG();
+ break;
+ case CPUCLOCK_PROF:
+ left = cputime_div(cputime_sub(expires.cpu, val.cpu),
+ nthreads);
+ do {
+ if (!unlikely(t->exit_state)) {
+ ticks = cputime_add(prof_ticks(t), left);
+ if (cputime_eq(t->it_prof_expires,
+ cputime_zero) ||
+ cputime_gt(t->it_prof_expires, ticks)) {
+ t->it_prof_expires = ticks;
+ }
+ }
+ t = next_thread(t);
+ } while (t != p);
+ break;
+ case CPUCLOCK_VIRT:
+ left = cputime_div(cputime_sub(expires.cpu, val.cpu),
+ nthreads);
+ do {
+ if (!unlikely(t->exit_state)) {
+ ticks = cputime_add(virt_ticks(t), left);
+ if (cputime_eq(t->it_virt_expires,
+ cputime_zero) ||
+ cputime_gt(t->it_virt_expires, ticks)) {
+ t->it_virt_expires = ticks;
+ }
+ }
+ t = next_thread(t);
+ } while (t != p);
+ break;
+ case CPUCLOCK_SCHED:
+ nsleft = expires.sched - val.sched;
+ do_div(nsleft, nthreads);
+ do {
+ if (!unlikely(t->exit_state)) {
+ ns = t->sched_time + nsleft;
+ if (t->it_sched_expires == 0 ||
+ t->it_sched_expires > ns) {
+ t->it_sched_expires = ns;
+ }
+ }
+ t = next_thread(t);
+ } while (t != p);
+ break;
+ }
+}
+
+static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
+{
+ /*
+ * That's all for this thread or process.
+ * We leave our residual in expires to be reported.
+ */
+ put_task_struct(timer->it.cpu.task);
+ timer->it.cpu.task = NULL;
+ timer->it.cpu.expires = cpu_time_sub(timer->it_clock,
+ timer->it.cpu.expires,
+ now);
+}
+
+/*
+ * Insert the timer on the appropriate list before any timers that
+ * expire later. This must be called with the tasklist_lock held
+ * for reading, and interrupts disabled.
+ */
+static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
+{
+ struct task_struct *p = timer->it.cpu.task;
+ struct list_head *head, *listpos;
+ struct cpu_timer_list *const nt = &timer->it.cpu;
+ struct cpu_timer_list *next;
+ unsigned long i;
+
+ head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
+ p->cpu_timers : p->signal->cpu_timers);
+ head += CPUCLOCK_WHICH(timer->it_clock);
+
+ BUG_ON(!irqs_disabled());
+ spin_lock(&p->sighand->siglock);
+
+ listpos = head;
+ if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
+ list_for_each_entry(next, head, entry) {
+ if (next->expires.sched > nt->expires.sched) {
+ listpos = &next->entry;
+ break;
+ }
+ }
+ } else {
+ list_for_each_entry(next, head, entry) {
+ if (cputime_gt(next->expires.cpu, nt->expires.cpu)) {
+ listpos = &next->entry;
+ break;
+ }
+ }
+ }
+ list_add(&nt->entry, listpos);
+
+ if (listpos == head) {
+ /*
+ * We are the new earliest-expiring timer.
+ * If we are a thread timer, there can always
+ * be a process timer telling us to stop earlier.
+ */
+
+ if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ switch (CPUCLOCK_WHICH(timer->it_clock)) {
+ default:
+ BUG();
+ case CPUCLOCK_PROF:
+ if (cputime_eq(p->it_prof_expires,
+ cputime_zero) ||
+ cputime_gt(p->it_prof_expires,
+ nt->expires.cpu))
+ p->it_prof_expires = nt->expires.cpu;
+ break;
+ case CPUCLOCK_VIRT:
+ if (cputime_eq(p->it_virt_expires,
+ cputime_zero) ||
+ cputime_gt(p->it_virt_expires,
+ nt->expires.cpu))
+ p->it_virt_expires = nt->expires.cpu;
+ break;
+ case CPUCLOCK_SCHED:
+ if (p->it_sched_expires == 0 ||
+ p->it_sched_expires > nt->expires.sched)
+ p->it_sched_expires = nt->expires.sched;
+ break;
+ }
+ } else {
+ /*
+ * For a process timer, we must balance
+ * all the live threads' expirations.
+ */
+ switch (CPUCLOCK_WHICH(timer->it_clock)) {
+ default:
+ BUG();
+ case CPUCLOCK_VIRT:
+ if (!cputime_eq(p->signal->it_virt_expires,
+ cputime_zero) &&
+ cputime_lt(p->signal->it_virt_expires,
+ timer->it.cpu.expires.cpu))
+ break;
+ goto rebalance;
+ case CPUCLOCK_PROF:
+ if (!cputime_eq(p->signal->it_prof_expires,
+ cputime_zero) &&
+ cputime_lt(p->signal->it_prof_expires,
+ timer->it.cpu.expires.cpu))
+ break;
+ i = p->signal->rlim[RLIMIT_CPU].rlim_cur;
+ if (i != RLIM_INFINITY &&
+ i <= cputime_to_secs(timer->it.cpu.expires.cpu))
+ break;
+ goto rebalance;
+ case CPUCLOCK_SCHED:
+ rebalance:
+ process_timer_rebalance(
+ timer->it.cpu.task,
+ CPUCLOCK_WHICH(timer->it_clock),
+ timer->it.cpu.expires, now);
+ break;
+ }
+ }
+ }
+
+ spin_unlock(&p->sighand->siglock);
+}
+
+/*
+ * The timer is locked, fire it and arrange for its reload.
+ */
+static void cpu_timer_fire(struct k_itimer *timer)
+{
+ if (unlikely(timer->sigq == NULL)) {
+ /*
+ * This a special case for clock_nanosleep,
+ * not a normal timer from sys_timer_create.
+ */
+ wake_up_process(timer->it_process);
+ timer->it.cpu.expires.sched = 0;
+ } else if (timer->it.cpu.incr.sched == 0) {
+ /*
+ * One-shot timer. Clear it as soon as it's fired.
+ */
+ posix_timer_event(timer, 0);
+ timer->it.cpu.expires.sched = 0;
+ } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
+ /*
+ * The signal did not get queued because the signal
+ * was ignored, so we won't get any callback to
+ * reload the timer. But we need to keep it
+ * ticking in case the signal is deliverable next time.
+ */
+ posix_cpu_timer_schedule(timer);
+ }
+}
+
+/*
+ * Guts of sys_timer_settime for CPU timers.
+ * This is called with the timer locked and interrupts disabled.
+ * If we return TIMER_RETRY, it's necessary to release the timer's lock
+ * and try again. (This happens when the timer is in the middle of firing.)
+ */
+int posix_cpu_timer_set(struct k_itimer *timer, int flags,
+ struct itimerspec *new, struct itimerspec *old)
+{
+ struct task_struct *p = timer->it.cpu.task;
+ union cpu_time_count old_expires, new_expires, val;
+ int ret;
+
+ if (unlikely(p == NULL)) {
+ /*
+ * Timer refers to a dead task's clock.
+ */
+ return -ESRCH;
+ }
+
+ new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
+
+ read_lock(&tasklist_lock);
+ /*
+ * We need the tasklist_lock to protect against reaping that
+ * clears p->signal. If p has just been reaped, we can no
+ * longer get any information about it at all.
+ */
+ if (unlikely(p->signal == NULL)) {
+ read_unlock(&tasklist_lock);
+ put_task_struct(p);
+ timer->it.cpu.task = NULL;
+ return -ESRCH;
+ }
+
+ /*
+ * Disarm any old timer after extracting its expiry time.
+ */
+ BUG_ON(!irqs_disabled());
+ spin_lock(&p->sighand->siglock);
+ old_expires = timer->it.cpu.expires;
+ list_del_init(&timer->it.cpu.entry);
+ spin_unlock(&p->sighand->siglock);
+
+ /*
+ * We need to sample the current value to convert the new
+ * value from to relative and absolute, and to convert the
+ * old value from absolute to relative. To set a process
+ * timer, we need a sample to balance the thread expiry
+ * times (in arm_timer). With an absolute time, we must
+ * check if it's already passed. In short, we need a sample.
+ */
+ if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ cpu_clock_sample(timer->it_clock, p, &val);
+ } else {
+ cpu_clock_sample_group(timer->it_clock, p, &val);
+ }
+
+ if (old) {
+ if (old_expires.sched == 0) {
+ old->it_value.tv_sec = 0;
+ old->it_value.tv_nsec = 0;
+ } else {
+ /*
+ * Update the timer in case it has
+ * overrun already. If it has,
+ * we'll report it as having overrun
+ * and with the next reloaded timer
+ * already ticking, though we are
+ * swallowing that pending
+ * notification here to install the
+ * new setting.
+ */
+ bump_cpu_timer(timer, val);
+ if (cpu_time_before(timer->it_clock, val,
+ timer->it.cpu.expires)) {
+ old_expires = cpu_time_sub(
+ timer->it_clock,
+ timer->it.cpu.expires, val);
+ sample_to_timespec(timer->it_clock,
+ old_expires,
+ &old->it_value);
+ } else {
+ old->it_value.tv_nsec = 1;
+ old->it_value.tv_sec = 0;
+ }
+ }
+ }
+
+ if (unlikely(timer->it.cpu.firing)) {
+ /*
+ * We are colliding with the timer actually firing.
+ * Punt after filling in the timer's old value, and
+ * disable this firing since we are already reporting
+ * it as an overrun (thanks to bump_cpu_timer above).
+ */
+ read_unlock(&tasklist_lock);
+ timer->it.cpu.firing = -1;
+ ret = TIMER_RETRY;
+ goto out;
+ }
+
+ if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) {
+ cpu_time_add(timer->it_clock, &new_expires, val);
+ }
+
+ /*
+ * Install the new expiry time (or zero).
+ * For a timer with no notification action, we don't actually
+ * arm the timer (we'll just fake it for timer_gettime).
+ */
+ timer->it.cpu.expires = new_expires;
+ if (new_expires.sched != 0 &&
+ (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
+ cpu_time_before(timer->it_clock, val, new_expires)) {
+ arm_timer(timer, val);
+ }
+
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Install the new reload setting, and
+ * set up the signal and overrun bookkeeping.
+ */
+ timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
+ &new->it_interval);
+
+ /*
+ * This acts as a modification timestamp for the timer,
+ * so any automatic reload attempt will punt on seeing
+ * that we have reset the timer manually.
+ */
+ timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
+ ~REQUEUE_PENDING;
+ timer->it_overrun_last = 0;
+ timer->it_overrun = -1;
+
+ if (new_expires.sched != 0 &&
+ (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
+ !cpu_time_before(timer->it_clock, val, new_expires)) {
+ /*
+ * The designated time already passed, so we notify
+ * immediately, even if the thread never runs to
+ * accumulate more time on this clock.
+ */
+ cpu_timer_fire(timer);
+ }
+
+ ret = 0;
+ out:
+ if (old) {
+ sample_to_timespec(timer->it_clock,
+ timer->it.cpu.incr, &old->it_interval);
+ }
+ return ret;
+}
+
+void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
+{
+ union cpu_time_count now;
+ struct task_struct *p = timer->it.cpu.task;
+ int clear_dead;
+
+ /*
+ * Easy part: convert the reload time.
+ */
+ sample_to_timespec(timer->it_clock,
+ timer->it.cpu.incr, &itp->it_interval);
+
+ if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */
+ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
+ return;
+ }
+
+ if (unlikely(p == NULL)) {
+ /*
+ * This task already died and the timer will never fire.
+ * In this case, expires is actually the dead value.
+ */
+ dead:
+ sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
+ &itp->it_value);
+ return;
+ }
+
+ /*
+ * Sample the clock to take the difference with the expiry time.
+ */
+ if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ cpu_clock_sample(timer->it_clock, p, &now);
+ clear_dead = p->exit_state;
+ } else {
+ read_lock(&tasklist_lock);
+ if (unlikely(p->signal == NULL)) {
+ /*
+ * The process has been reaped.
+ * We can't even collect a sample any more.
+ * Call the timer disarmed, nothing else to do.
+ */
+ put_task_struct(p);
+ timer->it.cpu.task = NULL;
+ timer->it.cpu.expires.sched = 0;
+ read_unlock(&tasklist_lock);
+ goto dead;
+ } else {
+ cpu_clock_sample_group(timer->it_clock, p, &now);
+ clear_dead = (unlikely(p->exit_state) &&
+ thread_group_empty(p));
+ }
+ read_unlock(&tasklist_lock);
+ }
+
+ if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
+ if (timer->it.cpu.incr.sched == 0 &&
+ cpu_time_before(timer->it_clock,
+ timer->it.cpu.expires, now)) {
+ /*
+ * Do-nothing timer expired and has no reload,
+ * so it's as if it was never set.
+ */
+ timer->it.cpu.expires.sched = 0;
+ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
+ return;
+ }
+ /*
+ * Account for any expirations and reloads that should
+ * have happened.
+ */
+ bump_cpu_timer(timer, now);
+ }
+
+ if (unlikely(clear_dead)) {
+ /*
+ * We've noticed that the thread is dead, but
+ * not yet reaped. Take this opportunity to
+ * drop our task ref.
+ */
+ clear_dead_task(timer, now);
+ goto dead;
+ }
+
+ if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) {
+ sample_to_timespec(timer->it_clock,
+ cpu_time_sub(timer->it_clock,
+ timer->it.cpu.expires, now),
+ &itp->it_value);
+ } else {
+ /*
+ * The timer should have expired already, but the firing
+ * hasn't taken place yet. Say it's just about to expire.
+ */
+ itp->it_value.tv_nsec = 1;
+ itp->it_value.tv_sec = 0;
+ }
+}
+
+/*
+ * Check for any per-thread CPU timers that have fired and move them off
+ * the tsk->cpu_timers[N] list onto the firing list. Here we update the
+ * tsk->it_*_expires values to reflect the remaining thread CPU timers.
+ */
+static void check_thread_timers(struct task_struct *tsk,
+ struct list_head *firing)
+{
+ struct list_head *timers = tsk->cpu_timers;
+
+ tsk->it_prof_expires = cputime_zero;
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list,
+ entry);
+ if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
+ tsk->it_prof_expires = t->expires.cpu;
+ break;
+ }
+ t->firing = 1;
+ list_move_tail(&t->entry, firing);
+ }
+
+ ++timers;
+ tsk->it_virt_expires = cputime_zero;
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list,
+ entry);
+ if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
+ tsk->it_virt_expires = t->expires.cpu;
+ break;
+ }
+ t->firing = 1;
+ list_move_tail(&t->entry, firing);
+ }
+
+ ++timers;
+ tsk->it_sched_expires = 0;
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list,
+ entry);
+ if (tsk->sched_time < t->expires.sched) {
+ tsk->it_sched_expires = t->expires.sched;
+ break;
+ }
+ t->firing = 1;
+ list_move_tail(&t->entry, firing);
+ }
+}
+
+/*
+ * Check for any per-thread CPU timers that have fired and move them
+ * off the tsk->*_timers list onto the firing list. Per-thread timers
+ * have already been taken off.
+ */
+static void check_process_timers(struct task_struct *tsk,
+ struct list_head *firing)
+{
+ struct signal_struct *const sig = tsk->signal;
+ cputime_t utime, stime, ptime, virt_expires, prof_expires;
+ unsigned long long sched_time, sched_expires;
+ struct task_struct *t;
+ struct list_head *timers = sig->cpu_timers;
+
+ /*
+ * Don't sample the current process CPU clocks if there are no timers.
+ */
+ if (list_empty(&timers[CPUCLOCK_PROF]) &&
+ cputime_eq(sig->it_prof_expires, cputime_zero) &&
+ sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
+ list_empty(&timers[CPUCLOCK_VIRT]) &&
+ cputime_eq(sig->it_virt_expires, cputime_zero) &&
+ list_empty(&timers[CPUCLOCK_SCHED]))
+ return;
+
+ /*
+ * Collect the current process totals.
+ */
+ utime = sig->utime;
+ stime = sig->stime;
+ sched_time = sig->sched_time;
+ t = tsk;
+ do {
+ utime = cputime_add(utime, t->utime);
+ stime = cputime_add(stime, t->stime);
+ sched_time += t->sched_time;
+ t = next_thread(t);
+ } while (t != tsk);
+ ptime = cputime_add(utime, stime);
+
+ prof_expires = cputime_zero;
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list,
+ entry);
+ if (cputime_lt(ptime, t->expires.cpu)) {
+ prof_expires = t->expires.cpu;
+ break;
+ }
+ t->firing = 1;
+ list_move_tail(&t->entry, firing);
+ }
+
+ ++timers;
+ virt_expires = cputime_zero;
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list,
+ entry);
+ if (cputime_lt(utime, t->expires.cpu)) {
+ virt_expires = t->expires.cpu;
+ break;
+ }
+ t->firing = 1;
+ list_move_tail(&t->entry, firing);
+ }
+
+ ++timers;
+ sched_expires = 0;
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list,
+ entry);
+ if (sched_time < t->expires.sched) {
+ sched_expires = t->expires.sched;
+ break;
+ }
+ t->firing = 1;
+ list_move_tail(&t->entry, firing);
+ }
+
+ /*
+ * Check for the special case process timers.
+ */
+ if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
+ if (cputime_ge(ptime, sig->it_prof_expires)) {
+ /* ITIMER_PROF fires and reloads. */
+ sig->it_prof_expires = sig->it_prof_incr;
+ if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
+ sig->it_prof_expires = cputime_add(
+ sig->it_prof_expires, ptime);
+ }
+ __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
+ }
+ if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
+ (cputime_eq(prof_expires, cputime_zero) ||
+ cputime_lt(sig->it_prof_expires, prof_expires))) {
+ prof_expires = sig->it_prof_expires;
+ }
+ }
+ if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
+ if (cputime_ge(utime, sig->it_virt_expires)) {
+ /* ITIMER_VIRTUAL fires and reloads. */
+ sig->it_virt_expires = sig->it_virt_incr;
+ if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
+ sig->it_virt_expires = cputime_add(
+ sig->it_virt_expires, utime);
+ }
+ __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
+ }
+ if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
+ (cputime_eq(virt_expires, cputime_zero) ||
+ cputime_lt(sig->it_virt_expires, virt_expires))) {
+ virt_expires = sig->it_virt_expires;
+ }
+ }
+ if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+ unsigned long psecs = cputime_to_secs(ptime);
+ cputime_t x;
+ if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) {
+ /*
+ * At the hard limit, we just die.
+ * No need to calculate anything else now.
+ */
+ __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
+ return;
+ }
+ if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) {
+ /*
+ * At the soft limit, send a SIGXCPU every second.
+ */
+ __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+ if (sig->rlim[RLIMIT_CPU].rlim_cur
+ < sig->rlim[RLIMIT_CPU].rlim_max) {
+ sig->rlim[RLIMIT_CPU].rlim_cur++;
+ }
+ }
+ x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+ if (cputime_eq(prof_expires, cputime_zero) ||
+ cputime_lt(x, prof_expires)) {
+ prof_expires = x;
+ }
+ }
+
+ if (!cputime_eq(prof_expires, cputime_zero) ||
+ !cputime_eq(virt_expires, cputime_zero) ||
+ sched_expires != 0) {
+ /*
+ * Rebalance the threads' expiry times for the remaining
+ * process CPU timers.
+ */
+
+ cputime_t prof_left, virt_left, ticks;
+ unsigned long long sched_left, sched;
+ const unsigned int nthreads = atomic_read(&sig->live);
+
+ prof_left = cputime_sub(prof_expires, utime);
+ prof_left = cputime_sub(prof_left, stime);
+ prof_left = cputime_div(prof_left, nthreads);
+ virt_left = cputime_sub(virt_expires, utime);
+ virt_left = cputime_div(virt_left, nthreads);
+ if (sched_expires) {
+ sched_left = sched_expires - sched_time;
+ do_div(sched_left, nthreads);
+ } else {
+ sched_left = 0;
+ }
+ t = tsk;
+ do {
+ ticks = cputime_add(cputime_add(t->utime, t->stime),
+ prof_left);
+ if (!cputime_eq(prof_expires, cputime_zero) &&
+ (cputime_eq(t->it_prof_expires, cputime_zero) ||
+ cputime_gt(t->it_prof_expires, ticks))) {
+ t->it_prof_expires = ticks;
+ }
+
+ ticks = cputime_add(t->utime, virt_left);
+ if (!cputime_eq(virt_expires, cputime_zero) &&
+ (cputime_eq(t->it_virt_expires, cputime_zero) ||
+ cputime_gt(t->it_virt_expires, ticks))) {
+ t->it_virt_expires = ticks;
+ }
+
+ sched = t->sched_time + sched_left;
+ if (sched_expires && (t->it_sched_expires == 0 ||
+ t->it_sched_expires > sched)) {
+ t->it_sched_expires = sched;
+ }
+
+ do {
+ t = next_thread(t);
+ } while (unlikely(t->exit_state));
+ } while (t != tsk);
+ }
+}
+
+/*
+ * This is called from the signal code (via do_schedule_next_timer)
+ * when the last timer signal was delivered and we have to reload the timer.
+ */
+void posix_cpu_timer_schedule(struct k_itimer *timer)
+{
+ struct task_struct *p = timer->it.cpu.task;
+ union cpu_time_count now;
+
+ if (unlikely(p == NULL))
+ /*
+ * The task was cleaned up already, no future firings.
+ */
+ return;
+
+ /*
+ * Fetch the current sample and update the timer's expiry time.
+ */
+ if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ cpu_clock_sample(timer->it_clock, p, &now);
+ bump_cpu_timer(timer, now);
+ if (unlikely(p->exit_state)) {
+ clear_dead_task(timer, now);
+ return;
+ }
+ read_lock(&tasklist_lock); /* arm_timer needs it. */
+ } else {
+ read_lock(&tasklist_lock);
+ if (unlikely(p->signal == NULL)) {
+ /*
+ * The process has been reaped.
+ * We can't even collect a sample any more.
+ */
+ put_task_struct(p);
+ timer->it.cpu.task = p = NULL;
+ timer->it.cpu.expires.sched = 0;
+ read_unlock(&tasklist_lock);
+ return;
+ } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
+ /*
+ * We've noticed that the thread is dead, but
+ * not yet reaped. Take this opportunity to
+ * drop our task ref.
+ */
+ clear_dead_task(timer, now);
+ read_unlock(&tasklist_lock);
+ return;
+ }
+ cpu_clock_sample_group(timer->it_clock, p, &now);
+ bump_cpu_timer(timer, now);
+ /* Leave the tasklist_lock locked for the call below. */
+ }
+
+ /*
+ * Now re-arm for the new expiry time.
+ */
+ arm_timer(timer, now);
+
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * This is called from the timer interrupt handler. The irq handler has
+ * already updated our counts. We need to check if any timers fire now.
+ * Interrupts are disabled.
+ */
+void run_posix_cpu_timers(struct task_struct *tsk)
+{
+ LIST_HEAD(firing);
+ struct k_itimer *timer, *next;
+
+ BUG_ON(!irqs_disabled());
+
+#define UNEXPIRED(clock) \
+ (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \
+ cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires))
+
+ if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
+ (tsk->it_sched_expires == 0 ||
+ tsk->sched_time < tsk->it_sched_expires))
+ return;
+
+#undef UNEXPIRED
+
+ BUG_ON(tsk->exit_state);
+
+ /*
+ * Double-check with locks held.
+ */
+ read_lock(&tasklist_lock);
+ spin_lock(&tsk->sighand->siglock);
+
+ /*
+ * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
+ * all the timers that are firing, and put them on the firing list.
+ */
+ check_thread_timers(tsk, &firing);
+ check_process_timers(tsk, &firing);
+
+ /*
+ * We must release these locks before taking any timer's lock.
+ * There is a potential race with timer deletion here, as the
+ * siglock now protects our private firing list. We have set
+ * the firing flag in each timer, so that a deletion attempt
+ * that gets the timer lock before we do will give it up and
+ * spin until we've taken care of that timer below.
+ */
+ spin_unlock(&tsk->sighand->siglock);
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Now that all the timers on our list have the firing flag,
+ * noone will touch their list entries but us. We'll take
+ * each timer's lock before clearing its firing flag, so no
+ * timer call will interfere.
+ */
+ list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
+ int firing;
+ spin_lock(&timer->it_lock);
+ list_del_init(&timer->it.cpu.entry);
+ firing = timer->it.cpu.firing;
+ timer->it.cpu.firing = 0;
+ /*
+ * The firing flag is -1 if we collided with a reset
+ * of the timer, which already reported this
+ * almost-firing as an overrun. So don't generate an event.
+ */
+ if (likely(firing >= 0)) {
+ cpu_timer_fire(timer);
+ }
+ spin_unlock(&timer->it_lock);
+ }
+}
+
+/*
+ * Set one of the process-wide special case CPU timers.
+ * The tasklist_lock and tsk->sighand->siglock must be held by the caller.
+ * The oldval argument is null for the RLIMIT_CPU timer, where *newval is
+ * absolute; non-null for ITIMER_*, where *newval is relative and we update
+ * it to be absolute, *oldval is absolute and we update it to be relative.
+ */
+void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
+ cputime_t *newval, cputime_t *oldval)
+{
+ union cpu_time_count now;
+ struct list_head *head;
+
+ BUG_ON(clock_idx == CPUCLOCK_SCHED);
+ cpu_clock_sample_group_locked(clock_idx, tsk, &now);
+
+ if (oldval) {
+ if (!cputime_eq(*oldval, cputime_zero)) {
+ if (cputime_le(*oldval, now.cpu)) {
+ /* Just about to fire. */
+ *oldval = jiffies_to_cputime(1);
+ } else {
+ *oldval = cputime_sub(*oldval, now.cpu);
+ }
+ }
+
+ if (cputime_eq(*newval, cputime_zero))
+ return;
+ *newval = cputime_add(*newval, now.cpu);
+
+ /*
+ * If the RLIMIT_CPU timer will expire before the
+ * ITIMER_PROF timer, we have nothing else to do.
+ */
+ if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur
+ < cputime_to_secs(*newval))
+ return;
+ }
+
+ /*
+ * Check whether there are any process timers already set to fire
+ * before this one. If so, we don't have anything more to do.
+ */
+ head = &tsk->signal->cpu_timers[clock_idx];
+ if (list_empty(head) ||
+ cputime_ge(list_entry(head->next,
+ struct cpu_timer_list, entry)->expires.cpu,
+ *newval)) {
+ /*
+ * Rejigger each thread's expiry time so that one will
+ * notice before we hit the process-cumulative expiry time.
+ */
+ union cpu_time_count expires = { .sched = 0 };
+ expires.cpu = *newval;
+ process_timer_rebalance(tsk, clock_idx, expires, now);
+ }
+}
+
+static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
+
+int posix_cpu_nsleep(clockid_t which_clock, int flags,
+ struct timespec *rqtp)
+{
+ struct restart_block *restart_block =
+ &current_thread_info()->restart_block;
+ struct k_itimer timer;
+ int error;
+
+ /*
+ * Diagnose required errors first.
+ */
+ if (CPUCLOCK_PERTHREAD(which_clock) &&
+ (CPUCLOCK_PID(which_clock) == 0 ||
+ CPUCLOCK_PID(which_clock) == current->pid))
+ return -EINVAL;
+
+ /*
+ * Set up a temporary timer and then wait for it to go off.
+ */
+ memset(&timer, 0, sizeof timer);
+ spin_lock_init(&timer.it_lock);
+ timer.it_clock = which_clock;
+ timer.it_overrun = -1;
+ error = posix_cpu_timer_create(&timer);
+ timer.it_process = current;
+ if (!error) {
+ struct timespec __user *rmtp;
+ static struct itimerspec zero_it;
+ struct itimerspec it = { .it_value = *rqtp,
+ .it_interval = {} };
+
+ spin_lock_irq(&timer.it_lock);
+ error = posix_cpu_timer_set(&timer, flags, &it, NULL);
+ if (error) {
+ spin_unlock_irq(&timer.it_lock);
+ return error;
+ }
+
+ while (!signal_pending(current)) {
+ if (timer.it.cpu.expires.sched == 0) {
+ /*
+ * Our timer fired and was reset.
+ */
+ spin_unlock_irq(&timer.it_lock);
+ return 0;
+ }
+
+ /*
+ * Block until cpu_timer_fire (or a signal) wakes us.
+ */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&timer.it_lock);
+ schedule();
+ spin_lock_irq(&timer.it_lock);
+ }
+
+ /*
+ * We were interrupted by a signal.
+ */
+ sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
+ posix_cpu_timer_set(&timer, 0, &zero_it, &it);
+ spin_unlock_irq(&timer.it_lock);
+
+ if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
+ /*
+ * It actually did fire already.
+ */
+ return 0;
+ }
+
+ /*
+ * Report back to the user the time still remaining.
+ */
+ rmtp = (struct timespec __user *) restart_block->arg1;
+ if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
+ copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ return -EFAULT;
+
+ restart_block->fn = posix_cpu_clock_nanosleep_restart;
+ /* Caller already set restart_block->arg1 */
+ restart_block->arg0 = which_clock;
+ restart_block->arg2 = rqtp->tv_sec;
+ restart_block->arg3 = rqtp->tv_nsec;
+
+ error = -ERESTART_RESTARTBLOCK;
+ }
+
+ return error;
+}
+
+static long
+posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
+{
+ clockid_t which_clock = restart_block->arg0;
+ struct timespec t = { .tv_sec = restart_block->arg2,
+ .tv_nsec = restart_block->arg3 };
+ restart_block->fn = do_no_restart_syscall;
+ return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t);
+}
+
+
+#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
+#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
+
+static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+{
+ return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
+}
+static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+ return posix_cpu_clock_get(PROCESS_CLOCK, tp);
+}
+static int process_cpu_timer_create(struct k_itimer *timer)
+{
+ timer->it_clock = PROCESS_CLOCK;
+ return posix_cpu_timer_create(timer);
+}
+static int process_cpu_nsleep(clockid_t which_clock, int flags,
+ struct timespec *rqtp)
+{
+ return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
+}
+static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+{
+ return posix_cpu_clock_getres(THREAD_CLOCK, tp);
+}
+static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+ return posix_cpu_clock_get(THREAD_CLOCK, tp);
+}
+static int thread_cpu_timer_create(struct k_itimer *timer)
+{
+ timer->it_clock = THREAD_CLOCK;
+ return posix_cpu_timer_create(timer);
+}
+static int thread_cpu_nsleep(clockid_t which_clock, int flags,
+ struct timespec *rqtp)
+{
+ return -EINVAL;
+}
+
+static __init int init_posix_cpu_timers(void)
+{
+ struct k_clock process = {
+ .clock_getres = process_cpu_clock_getres,
+ .clock_get = process_cpu_clock_get,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = process_cpu_timer_create,
+ .nsleep = process_cpu_nsleep,
+ };
+ struct k_clock thread = {
+ .clock_getres = thread_cpu_clock_getres,
+ .clock_get = thread_cpu_clock_get,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = thread_cpu_timer_create,
+ .nsleep = thread_cpu_nsleep,
+ };
+
+ register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
+ register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
+
+ return 0;
+}
+__initcall(init_posix_cpu_timers);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
new file mode 100644
index 000000000000..fd316c272260
--- /dev/null
+++ b/kernel/posix-timers.c
@@ -0,0 +1,1584 @@
+/*
+ * linux/kernel/posix_timers.c
+ *
+ *
+ * 2002-10-15 Posix Clocks & timers
+ * by George Anzinger george@mvista.com
+ *
+ * Copyright (C) 2002 2003 by MontaVista Software.
+ *
+ * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
+ * Copyright (C) 2004 Boris Hu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA
+ */
+
+/* These are all the functions necessary to implement
+ * POSIX clocks & timers
+ */
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/idr.h>
+#include <linux/posix-timers.h>
+#include <linux/syscalls.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <linux/module.h>
+
+#ifndef div_long_long_rem
+#include <asm/div64.h>
+
+#define div_long_long_rem(dividend,divisor,remainder) ({ \
+ u64 result = dividend; \
+ *remainder = do_div(result,divisor); \
+ result; })
+
+#endif
+#define CLOCK_REALTIME_RES TICK_NSEC /* In nano seconds. */
+
+static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2)
+{
+ return (u64)mpy1 * mpy2;
+}
+/*
+ * Management arrays for POSIX timers. Timers are kept in slab memory
+ * Timer ids are allocated by an external routine that keeps track of the
+ * id and the timer. The external interface is:
+ *
+ * void *idr_find(struct idr *idp, int id); to find timer_id <id>
+ * int idr_get_new(struct idr *idp, void *ptr); to get a new id and
+ * related it to <ptr>
+ * void idr_remove(struct idr *idp, int id); to release <id>
+ * void idr_init(struct idr *idp); to initialize <idp>
+ * which we supply.
+ * The idr_get_new *may* call slab for more memory so it must not be
+ * called under a spin lock. Likewise idr_remore may release memory
+ * (but it may be ok to do this under a lock...).
+ * idr_find is just a memory look up and is quite fast. A -1 return
+ * indicates that the requested id does not exist.
+ */
+
+/*
+ * Lets keep our timers in a slab cache :-)
+ */
+static kmem_cache_t *posix_timers_cache;
+static struct idr posix_timers_id;
+static DEFINE_SPINLOCK(idr_lock);
+
+/*
+ * Just because the timer is not in the timer list does NOT mean it is
+ * inactive. It could be in the "fire" routine getting a new expire time.
+ */
+#define TIMER_INACTIVE 1
+
+#ifdef CONFIG_SMP
+# define timer_active(tmr) \
+ ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE)
+# define set_timer_inactive(tmr) \
+ do { \
+ (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \
+ } while (0)
+#else
+# define timer_active(tmr) BARFY // error to use outside of SMP
+# define set_timer_inactive(tmr) do { } while (0)
+#endif
+/*
+ * we assume that the new SIGEV_THREAD_ID shares no bits with the other
+ * SIGEV values. Here we put out an error if this assumption fails.
+ */
+#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
+ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
+#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
+#endif
+
+
+/*
+ * The timer ID is turned into a timer address by idr_find().
+ * Verifying a valid ID consists of:
+ *
+ * a) checking that idr_find() returns other than -1.
+ * b) checking that the timer id matches the one in the timer itself.
+ * c) that the timer owner is in the callers thread group.
+ */
+
+/*
+ * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
+ * to implement others. This structure defines the various
+ * clocks and allows the possibility of adding others. We
+ * provide an interface to add clocks to the table and expect
+ * the "arch" code to add at least one clock that is high
+ * resolution. Here we define the standard CLOCK_REALTIME as a
+ * 1/HZ resolution clock.
+ *
+ * RESOLUTION: Clock resolution is used to round up timer and interval
+ * times, NOT to report clock times, which are reported with as
+ * much resolution as the system can muster. In some cases this
+ * resolution may depend on the underlying clock hardware and
+ * may not be quantifiable until run time, and only then is the
+ * necessary code is written. The standard says we should say
+ * something about this issue in the documentation...
+ *
+ * FUNCTIONS: The CLOCKs structure defines possible functions to handle
+ * various clock functions. For clocks that use the standard
+ * system timer code these entries should be NULL. This will
+ * allow dispatch without the overhead of indirect function
+ * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
+ * must supply functions here, even if the function just returns
+ * ENOSYS. The standard POSIX timer management code assumes the
+ * following: 1.) The k_itimer struct (sched.h) is used for the
+ * timer. 2.) The list, it_lock, it_clock, it_id and it_process
+ * fields are not modified by timer code.
+ *
+ * At this time all functions EXCEPT clock_nanosleep can be
+ * redirected by the CLOCKS structure. Clock_nanosleep is in
+ * there, but the code ignores it.
+ *
+ * Permissions: It is assumed that the clock_settime() function defined
+ * for each clock will take care of permission checks. Some
+ * clocks may be set able by any user (i.e. local process
+ * clocks) others not. Currently the only set able clock we
+ * have is CLOCK_REALTIME and its high res counter part, both of
+ * which we beg off on and pass to do_sys_settimeofday().
+ */
+
+static struct k_clock posix_clocks[MAX_CLOCKS];
+/*
+ * We only have one real clock that can be set so we need only one abs list,
+ * even if we should want to have several clocks with differing resolutions.
+ */
+static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list),
+ .lock = SPIN_LOCK_UNLOCKED};
+
+static void posix_timer_fn(unsigned long);
+static u64 do_posix_clock_monotonic_gettime_parts(
+ struct timespec *tp, struct timespec *mo);
+int do_posix_clock_monotonic_gettime(struct timespec *tp);
+static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp);
+
+static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
+
+static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
+{
+ spin_unlock_irqrestore(&timr->it_lock, flags);
+}
+
+/*
+ * Call the k_clock hook function if non-null, or the default function.
+ */
+#define CLOCK_DISPATCH(clock, call, arglist) \
+ ((clock) < 0 ? posix_cpu_##call arglist : \
+ (posix_clocks[clock].call != NULL \
+ ? (*posix_clocks[clock].call) arglist : common_##call arglist))
+
+/*
+ * Default clock hook functions when the struct k_clock passed
+ * to register_posix_clock leaves a function pointer null.
+ *
+ * The function common_CALL is the default implementation for
+ * the function pointer CALL in struct k_clock.
+ */
+
+static inline int common_clock_getres(clockid_t which_clock,
+ struct timespec *tp)
+{
+ tp->tv_sec = 0;
+ tp->tv_nsec = posix_clocks[which_clock].res;
+ return 0;
+}
+
+static inline int common_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+ getnstimeofday(tp);
+ return 0;
+}
+
+static inline int common_clock_set(clockid_t which_clock, struct timespec *tp)
+{
+ return do_sys_settimeofday(tp, NULL);
+}
+
+static inline int common_timer_create(struct k_itimer *new_timer)
+{
+ INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry);
+ init_timer(&new_timer->it.real.timer);
+ new_timer->it.real.timer.data = (unsigned long) new_timer;
+ new_timer->it.real.timer.function = posix_timer_fn;
+ set_timer_inactive(new_timer);
+ return 0;
+}
+
+/*
+ * These ones are defined below.
+ */
+static int common_nsleep(clockid_t, int flags, struct timespec *t);
+static void common_timer_get(struct k_itimer *, struct itimerspec *);
+static int common_timer_set(struct k_itimer *, int,
+ struct itimerspec *, struct itimerspec *);
+static int common_timer_del(struct k_itimer *timer);
+
+/*
+ * Return nonzero iff we know a priori this clockid_t value is bogus.
+ */
+static inline int invalid_clockid(clockid_t which_clock)
+{
+ if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */
+ return 0;
+ if ((unsigned) which_clock >= MAX_CLOCKS)
+ return 1;
+ if (posix_clocks[which_clock].clock_getres != NULL)
+ return 0;
+#ifndef CLOCK_DISPATCH_DIRECT
+ if (posix_clocks[which_clock].res != 0)
+ return 0;
+#endif
+ return 1;
+}
+
+
+/*
+ * Initialize everything, well, just everything in Posix clocks/timers ;)
+ */
+static __init int init_posix_timers(void)
+{
+ struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES,
+ .abs_struct = &abs_list
+ };
+ struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES,
+ .abs_struct = NULL,
+ .clock_get = do_posix_clock_monotonic_get,
+ .clock_set = do_posix_clock_nosettime
+ };
+
+ register_posix_clock(CLOCK_REALTIME, &clock_realtime);
+ register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
+
+ posix_timers_cache = kmem_cache_create("posix_timers_cache",
+ sizeof (struct k_itimer), 0, 0, NULL, NULL);
+ idr_init(&posix_timers_id);
+ return 0;
+}
+
+__initcall(init_posix_timers);
+
+static void tstojiffie(struct timespec *tp, int res, u64 *jiff)
+{
+ long sec = tp->tv_sec;
+ long nsec = tp->tv_nsec + res - 1;
+
+ if (nsec > NSEC_PER_SEC) {
+ sec++;
+ nsec -= NSEC_PER_SEC;
+ }
+
+ /*
+ * The scaling constants are defined in <linux/time.h>
+ * The difference between there and here is that we do the
+ * res rounding and compute a 64-bit result (well so does that
+ * but it then throws away the high bits).
+ */
+ *jiff = (mpy_l_X_l_ll(sec, SEC_CONVERSION) +
+ (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >>
+ (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
+}
+
+/*
+ * This function adjusts the timer as needed as a result of the clock
+ * being set. It should only be called for absolute timers, and then
+ * under the abs_list lock. It computes the time difference and sets
+ * the new jiffies value in the timer. It also updates the timers
+ * reference wall_to_monotonic value. It is complicated by the fact
+ * that tstojiffies() only handles positive times and it needs to work
+ * with both positive and negative times. Also, for negative offsets,
+ * we need to defeat the res round up.
+ *
+ * Return is true if there is a new time, else false.
+ */
+static long add_clockset_delta(struct k_itimer *timr,
+ struct timespec *new_wall_to)
+{
+ struct timespec delta;
+ int sign = 0;
+ u64 exp;
+
+ set_normalized_timespec(&delta,
+ new_wall_to->tv_sec -
+ timr->it.real.wall_to_prev.tv_sec,
+ new_wall_to->tv_nsec -
+ timr->it.real.wall_to_prev.tv_nsec);
+ if (likely(!(delta.tv_sec | delta.tv_nsec)))
+ return 0;
+ if (delta.tv_sec < 0) {
+ set_normalized_timespec(&delta,
+ -delta.tv_sec,
+ 1 - delta.tv_nsec -
+ posix_clocks[timr->it_clock].res);
+ sign++;
+ }
+ tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp);
+ timr->it.real.wall_to_prev = *new_wall_to;
+ timr->it.real.timer.expires += (sign ? -exp : exp);
+ return 1;
+}
+
+static void remove_from_abslist(struct k_itimer *timr)
+{
+ if (!list_empty(&timr->it.real.abs_timer_entry)) {
+ spin_lock(&abs_list.lock);
+ list_del_init(&timr->it.real.abs_timer_entry);
+ spin_unlock(&abs_list.lock);
+ }
+}
+
+static void schedule_next_timer(struct k_itimer *timr)
+{
+ struct timespec new_wall_to;
+ struct now_struct now;
+ unsigned long seq;
+
+ /*
+ * Set up the timer for the next interval (if there is one).
+ * Note: this code uses the abs_timer_lock to protect
+ * it.real.wall_to_prev and must hold it until exp is set, not exactly
+ * obvious...
+
+ * This function is used for CLOCK_REALTIME* and
+ * CLOCK_MONOTONIC* timers. If we ever want to handle other
+ * CLOCKs, the calling code (do_schedule_next_timer) would need
+ * to pull the "clock" info from the timer and dispatch the
+ * "other" CLOCKs "next timer" code (which, I suppose should
+ * also be added to the k_clock structure).
+ */
+ if (!timr->it.real.incr)
+ return;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ new_wall_to = wall_to_monotonic;
+ posix_get_now(&now);
+ } while (read_seqretry(&xtime_lock, seq));
+
+ if (!list_empty(&timr->it.real.abs_timer_entry)) {
+ spin_lock(&abs_list.lock);
+ add_clockset_delta(timr, &new_wall_to);
+
+ posix_bump_timer(timr, now);
+
+ spin_unlock(&abs_list.lock);
+ } else {
+ posix_bump_timer(timr, now);
+ }
+ timr->it_overrun_last = timr->it_overrun;
+ timr->it_overrun = -1;
+ ++timr->it_requeue_pending;
+ add_timer(&timr->it.real.timer);
+}
+
+/*
+ * This function is exported for use by the signal deliver code. It is
+ * called just prior to the info block being released and passes that
+ * block to us. It's function is to update the overrun entry AND to
+ * restart the timer. It should only be called if the timer is to be
+ * restarted (i.e. we have flagged this in the sys_private entry of the
+ * info block).
+ *
+ * To protect aginst the timer going away while the interrupt is queued,
+ * we require that the it_requeue_pending flag be set.
+ */
+void do_schedule_next_timer(struct siginfo *info)
+{
+ struct k_itimer *timr;
+ unsigned long flags;
+
+ timr = lock_timer(info->si_tid, &flags);
+
+ if (!timr || timr->it_requeue_pending != info->si_sys_private)
+ goto exit;
+
+ if (timr->it_clock < 0) /* CPU clock */
+ posix_cpu_timer_schedule(timr);
+ else
+ schedule_next_timer(timr);
+ info->si_overrun = timr->it_overrun_last;
+exit:
+ if (timr)
+ unlock_timer(timr, flags);
+}
+
+int posix_timer_event(struct k_itimer *timr,int si_private)
+{
+ memset(&timr->sigq->info, 0, sizeof(siginfo_t));
+ timr->sigq->info.si_sys_private = si_private;
+ /*
+ * Send signal to the process that owns this timer.
+
+ * This code assumes that all the possible abs_lists share the
+ * same lock (there is only one list at this time). If this is
+ * not the case, the CLOCK info would need to be used to find
+ * the proper abs list lock.
+ */
+
+ timr->sigq->info.si_signo = timr->it_sigev_signo;
+ timr->sigq->info.si_errno = 0;
+ timr->sigq->info.si_code = SI_TIMER;
+ timr->sigq->info.si_tid = timr->it_id;
+ timr->sigq->info.si_value = timr->it_sigev_value;
+ if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
+ if (unlikely(timr->it_process->flags & PF_EXITING)) {
+ timr->it_sigev_notify = SIGEV_SIGNAL;
+ put_task_struct(timr->it_process);
+ timr->it_process = timr->it_process->group_leader;
+ goto group;
+ }
+ return send_sigqueue(timr->it_sigev_signo, timr->sigq,
+ timr->it_process);
+ }
+ else {
+ group:
+ return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
+ timr->it_process);
+ }
+}
+EXPORT_SYMBOL_GPL(posix_timer_event);
+
+/*
+ * This function gets called when a POSIX.1b interval timer expires. It
+ * is used as a callback from the kernel internal timer. The
+ * run_timer_list code ALWAYS calls with interrupts on.
+
+ * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
+ */
+static void posix_timer_fn(unsigned long __data)
+{
+ struct k_itimer *timr = (struct k_itimer *) __data;
+ unsigned long flags;
+ unsigned long seq;
+ struct timespec delta, new_wall_to;
+ u64 exp = 0;
+ int do_notify = 1;
+
+ spin_lock_irqsave(&timr->it_lock, flags);
+ set_timer_inactive(timr);
+ if (!list_empty(&timr->it.real.abs_timer_entry)) {
+ spin_lock(&abs_list.lock);
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ new_wall_to = wall_to_monotonic;
+ } while (read_seqretry(&xtime_lock, seq));
+ set_normalized_timespec(&delta,
+ new_wall_to.tv_sec -
+ timr->it.real.wall_to_prev.tv_sec,
+ new_wall_to.tv_nsec -
+ timr->it.real.wall_to_prev.tv_nsec);
+ if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) {
+ /* do nothing, timer is on time */
+ } else if (delta.tv_sec < 0) {
+ /* do nothing, timer is already late */
+ } else {
+ /* timer is early due to a clock set */
+ tstojiffie(&delta,
+ posix_clocks[timr->it_clock].res,
+ &exp);
+ timr->it.real.wall_to_prev = new_wall_to;
+ timr->it.real.timer.expires += exp;
+ add_timer(&timr->it.real.timer);
+ do_notify = 0;
+ }
+ spin_unlock(&abs_list.lock);
+
+ }
+ if (do_notify) {
+ int si_private=0;
+
+ if (timr->it.real.incr)
+ si_private = ++timr->it_requeue_pending;
+ else {
+ remove_from_abslist(timr);
+ }
+
+ if (posix_timer_event(timr, si_private))
+ /*
+ * signal was not sent because of sig_ignor
+ * we will not get a call back to restart it AND
+ * it should be restarted.
+ */
+ schedule_next_timer(timr);
+ }
+ unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */
+}
+
+
+static inline struct task_struct * good_sigevent(sigevent_t * event)
+{
+ struct task_struct *rtn = current->group_leader;
+
+ if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
+ (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
+ rtn->tgid != current->tgid ||
+ (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
+ return NULL;
+
+ if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
+ ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
+ return NULL;
+
+ return rtn;
+}
+
+void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock)
+{
+ if ((unsigned) clock_id >= MAX_CLOCKS) {
+ printk("POSIX clock register failed for clock_id %d\n",
+ clock_id);
+ return;
+ }
+
+ posix_clocks[clock_id] = *new_clock;
+}
+EXPORT_SYMBOL_GPL(register_posix_clock);
+
+static struct k_itimer * alloc_posix_timer(void)
+{
+ struct k_itimer *tmr;
+ tmr = kmem_cache_alloc(posix_timers_cache, GFP_KERNEL);
+ if (!tmr)
+ return tmr;
+ memset(tmr, 0, sizeof (struct k_itimer));
+ if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
+ kmem_cache_free(posix_timers_cache, tmr);
+ tmr = NULL;
+ }
+ return tmr;
+}
+
+#define IT_ID_SET 1
+#define IT_ID_NOT_SET 0
+static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
+{
+ if (it_id_set) {
+ unsigned long flags;
+ spin_lock_irqsave(&idr_lock, flags);
+ idr_remove(&posix_timers_id, tmr->it_id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+ }
+ sigqueue_free(tmr->sigq);
+ if (unlikely(tmr->it_process) &&
+ tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+ put_task_struct(tmr->it_process);
+ kmem_cache_free(posix_timers_cache, tmr);
+}
+
+/* Create a POSIX.1b interval timer. */
+
+asmlinkage long
+sys_timer_create(clockid_t which_clock,
+ struct sigevent __user *timer_event_spec,
+ timer_t __user * created_timer_id)
+{
+ int error = 0;
+ struct k_itimer *new_timer = NULL;
+ int new_timer_id;
+ struct task_struct *process = NULL;
+ unsigned long flags;
+ sigevent_t event;
+ int it_id_set = IT_ID_NOT_SET;
+
+ if (invalid_clockid(which_clock))
+ return -EINVAL;
+
+ new_timer = alloc_posix_timer();
+ if (unlikely(!new_timer))
+ return -EAGAIN;
+
+ spin_lock_init(&new_timer->it_lock);
+ retry:
+ if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) {
+ error = -EAGAIN;
+ goto out;
+ }
+ spin_lock_irq(&idr_lock);
+ error = idr_get_new(&posix_timers_id,
+ (void *) new_timer,
+ &new_timer_id);
+ spin_unlock_irq(&idr_lock);
+ if (error == -EAGAIN)
+ goto retry;
+ else if (error) {
+ /*
+ * Wierd looking, but we return EAGAIN if the IDR is
+ * full (proper POSIX return value for this)
+ */
+ error = -EAGAIN;
+ goto out;
+ }
+
+ it_id_set = IT_ID_SET;
+ new_timer->it_id = (timer_t) new_timer_id;
+ new_timer->it_clock = which_clock;
+ new_timer->it_overrun = -1;
+ error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
+ if (error)
+ goto out;
+
+ /*
+ * return the timer_id now. The next step is hard to
+ * back out if there is an error.
+ */
+ if (copy_to_user(created_timer_id,
+ &new_timer_id, sizeof (new_timer_id))) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (timer_event_spec) {
+ if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
+ error = -EFAULT;
+ goto out;
+ }
+ new_timer->it_sigev_notify = event.sigev_notify;
+ new_timer->it_sigev_signo = event.sigev_signo;
+ new_timer->it_sigev_value = event.sigev_value;
+
+ read_lock(&tasklist_lock);
+ if ((process = good_sigevent(&event))) {
+ /*
+ * We may be setting up this process for another
+ * thread. It may be exiting. To catch this
+ * case the we check the PF_EXITING flag. If
+ * the flag is not set, the siglock will catch
+ * him before it is too late (in exit_itimers).
+ *
+ * The exec case is a bit more invloved but easy
+ * to code. If the process is in our thread
+ * group (and it must be or we would not allow
+ * it here) and is doing an exec, it will cause
+ * us to be killed. In this case it will wait
+ * for us to die which means we can finish this
+ * linkage with our last gasp. I.e. no code :)
+ */
+ spin_lock_irqsave(&process->sighand->siglock, flags);
+ if (!(process->flags & PF_EXITING)) {
+ new_timer->it_process = process;
+ list_add(&new_timer->list,
+ &process->signal->posix_timers);
+ spin_unlock_irqrestore(&process->sighand->siglock, flags);
+ if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+ get_task_struct(process);
+ } else {
+ spin_unlock_irqrestore(&process->sighand->siglock, flags);
+ process = NULL;
+ }
+ }
+ read_unlock(&tasklist_lock);
+ if (!process) {
+ error = -EINVAL;
+ goto out;
+ }
+ } else {
+ new_timer->it_sigev_notify = SIGEV_SIGNAL;
+ new_timer->it_sigev_signo = SIGALRM;
+ new_timer->it_sigev_value.sival_int = new_timer->it_id;
+ process = current->group_leader;
+ spin_lock_irqsave(&process->sighand->siglock, flags);
+ new_timer->it_process = process;
+ list_add(&new_timer->list, &process->signal->posix_timers);
+ spin_unlock_irqrestore(&process->sighand->siglock, flags);
+ }
+
+ /*
+ * In the case of the timer belonging to another task, after
+ * the task is unlocked, the timer is owned by the other task
+ * and may cease to exist at any time. Don't use or modify
+ * new_timer after the unlock call.
+ */
+
+out:
+ if (error)
+ release_posix_timer(new_timer, it_id_set);
+
+ return error;
+}
+
+/*
+ * good_timespec
+ *
+ * This function checks the elements of a timespec structure.
+ *
+ * Arguments:
+ * ts : Pointer to the timespec structure to check
+ *
+ * Return value:
+ * If a NULL pointer was passed in, or the tv_nsec field was less than 0
+ * or greater than NSEC_PER_SEC, or the tv_sec field was less than 0,
+ * this function returns 0. Otherwise it returns 1.
+ */
+static int good_timespec(const struct timespec *ts)
+{
+ if ((!ts) || (ts->tv_sec < 0) ||
+ ((unsigned) ts->tv_nsec >= NSEC_PER_SEC))
+ return 0;
+ return 1;
+}
+
+/*
+ * Locking issues: We need to protect the result of the id look up until
+ * we get the timer locked down so it is not deleted under us. The
+ * removal is done under the idr spinlock so we use that here to bridge
+ * the find to the timer lock. To avoid a dead lock, the timer id MUST
+ * be release with out holding the timer lock.
+ */
+static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
+{
+ struct k_itimer *timr;
+ /*
+ * Watch out here. We do a irqsave on the idr_lock and pass the
+ * flags part over to the timer lock. Must not let interrupts in
+ * while we are moving the lock.
+ */
+
+ spin_lock_irqsave(&idr_lock, *flags);
+ timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
+ if (timr) {
+ spin_lock(&timr->it_lock);
+ spin_unlock(&idr_lock);
+
+ if ((timr->it_id != timer_id) || !(timr->it_process) ||
+ timr->it_process->tgid != current->tgid) {
+ unlock_timer(timr, *flags);
+ timr = NULL;
+ }
+ } else
+ spin_unlock_irqrestore(&idr_lock, *flags);
+
+ return timr;
+}
+
+/*
+ * Get the time remaining on a POSIX.1b interval timer. This function
+ * is ALWAYS called with spin_lock_irq on the timer, thus it must not
+ * mess with irq.
+ *
+ * We have a couple of messes to clean up here. First there is the case
+ * of a timer that has a requeue pending. These timers should appear to
+ * be in the timer list with an expiry as if we were to requeue them
+ * now.
+ *
+ * The second issue is the SIGEV_NONE timer which may be active but is
+ * not really ever put in the timer list (to save system resources).
+ * This timer may be expired, and if so, we will do it here. Otherwise
+ * it is the same as a requeue pending timer WRT to what we should
+ * report.
+ */
+static void
+common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
+{
+ unsigned long expires;
+ struct now_struct now;
+
+ do
+ expires = timr->it.real.timer.expires;
+ while ((volatile long) (timr->it.real.timer.expires) != expires);
+
+ posix_get_now(&now);
+
+ if (expires &&
+ ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) &&
+ !timr->it.real.incr &&
+ posix_time_before(&timr->it.real.timer, &now))
+ timr->it.real.timer.expires = expires = 0;
+ if (expires) {
+ if (timr->it_requeue_pending & REQUEUE_PENDING ||
+ (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
+ posix_bump_timer(timr, now);
+ expires = timr->it.real.timer.expires;
+ }
+ else
+ if (!timer_pending(&timr->it.real.timer))
+ expires = 0;
+ if (expires)
+ expires -= now.jiffies;
+ }
+ jiffies_to_timespec(expires, &cur_setting->it_value);
+ jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval);
+
+ if (cur_setting->it_value.tv_sec < 0) {
+ cur_setting->it_value.tv_nsec = 1;
+ cur_setting->it_value.tv_sec = 0;
+ }
+}
+
+/* Get the time remaining on a POSIX.1b interval timer. */
+asmlinkage long
+sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
+{
+ struct k_itimer *timr;
+ struct itimerspec cur_setting;
+ unsigned long flags;
+
+ timr = lock_timer(timer_id, &flags);
+ if (!timr)
+ return -EINVAL;
+
+ CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting));
+
+ unlock_timer(timr, flags);
+
+ if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+ return -EFAULT;
+
+ return 0;
+}
+/*
+ * Get the number of overruns of a POSIX.1b interval timer. This is to
+ * be the overrun of the timer last delivered. At the same time we are
+ * accumulating overruns on the next timer. The overrun is frozen when
+ * the signal is delivered, either at the notify time (if the info block
+ * is not queued) or at the actual delivery time (as we are informed by
+ * the call back to do_schedule_next_timer(). So all we need to do is
+ * to pick up the frozen overrun.
+ */
+
+asmlinkage long
+sys_timer_getoverrun(timer_t timer_id)
+{
+ struct k_itimer *timr;
+ int overrun;
+ long flags;
+
+ timr = lock_timer(timer_id, &flags);
+ if (!timr)
+ return -EINVAL;
+
+ overrun = timr->it_overrun_last;
+ unlock_timer(timr, flags);
+
+ return overrun;
+}
+/*
+ * Adjust for absolute time
+ *
+ * If absolute time is given and it is not CLOCK_MONOTONIC, we need to
+ * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and
+ * what ever clock he is using.
+ *
+ * If it is relative time, we need to add the current (CLOCK_MONOTONIC)
+ * time to it to get the proper time for the timer.
+ */
+static int adjust_abs_time(struct k_clock *clock, struct timespec *tp,
+ int abs, u64 *exp, struct timespec *wall_to)
+{
+ struct timespec now;
+ struct timespec oc = *tp;
+ u64 jiffies_64_f;
+ int rtn =0;
+
+ if (abs) {
+ /*
+ * The mask pick up the 4 basic clocks
+ */
+ if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) {
+ jiffies_64_f = do_posix_clock_monotonic_gettime_parts(
+ &now, wall_to);
+ /*
+ * If we are doing a MONOTONIC clock
+ */
+ if((clock - &posix_clocks[0]) & CLOCKS_MONO){
+ now.tv_sec += wall_to->tv_sec;
+ now.tv_nsec += wall_to->tv_nsec;
+ }
+ } else {
+ /*
+ * Not one of the basic clocks
+ */
+ clock->clock_get(clock - posix_clocks, &now);
+ jiffies_64_f = get_jiffies_64();
+ }
+ /*
+ * Take away now to get delta
+ */
+ oc.tv_sec -= now.tv_sec;
+ oc.tv_nsec -= now.tv_nsec;
+ /*
+ * Normalize...
+ */
+ while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) {
+ oc.tv_nsec -= NSEC_PER_SEC;
+ oc.tv_sec++;
+ }
+ while ((oc.tv_nsec) < 0) {
+ oc.tv_nsec += NSEC_PER_SEC;
+ oc.tv_sec--;
+ }
+ }else{
+ jiffies_64_f = get_jiffies_64();
+ }
+ /*
+ * Check if the requested time is prior to now (if so set now)
+ */
+ if (oc.tv_sec < 0)
+ oc.tv_sec = oc.tv_nsec = 0;
+
+ if (oc.tv_sec | oc.tv_nsec)
+ set_normalized_timespec(&oc, oc.tv_sec,
+ oc.tv_nsec + clock->res);
+ tstojiffie(&oc, clock->res, exp);
+
+ /*
+ * Check if the requested time is more than the timer code
+ * can handle (if so we error out but return the value too).
+ */
+ if (*exp > ((u64)MAX_JIFFY_OFFSET))
+ /*
+ * This is a considered response, not exactly in
+ * line with the standard (in fact it is silent on
+ * possible overflows). We assume such a large
+ * value is ALMOST always a programming error and
+ * try not to compound it by setting a really dumb
+ * value.
+ */
+ rtn = -EINVAL;
+ /*
+ * return the actual jiffies expire time, full 64 bits
+ */
+ *exp += jiffies_64_f;
+ return rtn;
+}
+
+/* Set a POSIX.1b interval timer. */
+/* timr->it_lock is taken. */
+static inline int
+common_timer_set(struct k_itimer *timr, int flags,
+ struct itimerspec *new_setting, struct itimerspec *old_setting)
+{
+ struct k_clock *clock = &posix_clocks[timr->it_clock];
+ u64 expire_64;
+
+ if (old_setting)
+ common_timer_get(timr, old_setting);
+
+ /* disable the timer */
+ timr->it.real.incr = 0;
+ /*
+ * careful here. If smp we could be in the "fire" routine which will
+ * be spinning as we hold the lock. But this is ONLY an SMP issue.
+ */
+#ifdef CONFIG_SMP
+ if (timer_active(timr) && !del_timer(&timr->it.real.timer))
+ /*
+ * It can only be active if on an other cpu. Since
+ * we have cleared the interval stuff above, it should
+ * clear once we release the spin lock. Of course once
+ * we do that anything could happen, including the
+ * complete melt down of the timer. So return with
+ * a "retry" exit status.
+ */
+ return TIMER_RETRY;
+
+ set_timer_inactive(timr);
+#else
+ del_timer(&timr->it.real.timer);
+#endif
+ remove_from_abslist(timr);
+
+ timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
+ ~REQUEUE_PENDING;
+ timr->it_overrun_last = 0;
+ timr->it_overrun = -1;
+ /*
+ *switch off the timer when it_value is zero
+ */
+ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) {
+ timr->it.real.timer.expires = 0;
+ return 0;
+ }
+
+ if (adjust_abs_time(clock,
+ &new_setting->it_value, flags & TIMER_ABSTIME,
+ &expire_64, &(timr->it.real.wall_to_prev))) {
+ return -EINVAL;
+ }
+ timr->it.real.timer.expires = (unsigned long)expire_64;
+ tstojiffie(&new_setting->it_interval, clock->res, &expire_64);
+ timr->it.real.incr = (unsigned long)expire_64;
+
+ /*
+ * We do not even queue SIGEV_NONE timers! But we do put them
+ * in the abs list so we can do that right.
+ */
+ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE))
+ add_timer(&timr->it.real.timer);
+
+ if (flags & TIMER_ABSTIME && clock->abs_struct) {
+ spin_lock(&clock->abs_struct->lock);
+ list_add_tail(&(timr->it.real.abs_timer_entry),
+ &(clock->abs_struct->list));
+ spin_unlock(&clock->abs_struct->lock);
+ }
+ return 0;
+}
+
+/* Set a POSIX.1b interval timer */
+asmlinkage long
+sys_timer_settime(timer_t timer_id, int flags,
+ const struct itimerspec __user *new_setting,
+ struct itimerspec __user *old_setting)
+{
+ struct k_itimer *timr;
+ struct itimerspec new_spec, old_spec;
+ int error = 0;
+ long flag;
+ struct itimerspec *rtn = old_setting ? &old_spec : NULL;
+
+ if (!new_setting)
+ return -EINVAL;
+
+ if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
+ return -EFAULT;
+
+ if ((!good_timespec(&new_spec.it_interval)) ||
+ (!good_timespec(&new_spec.it_value)))
+ return -EINVAL;
+retry:
+ timr = lock_timer(timer_id, &flag);
+ if (!timr)
+ return -EINVAL;
+
+ error = CLOCK_DISPATCH(timr->it_clock, timer_set,
+ (timr, flags, &new_spec, rtn));
+
+ unlock_timer(timr, flag);
+ if (error == TIMER_RETRY) {
+ rtn = NULL; // We already got the old time...
+ goto retry;
+ }
+
+ if (old_setting && !error && copy_to_user(old_setting,
+ &old_spec, sizeof (old_spec)))
+ error = -EFAULT;
+
+ return error;
+}
+
+static inline int common_timer_del(struct k_itimer *timer)
+{
+ timer->it.real.incr = 0;
+#ifdef CONFIG_SMP
+ if (timer_active(timer) && !del_timer(&timer->it.real.timer))
+ /*
+ * It can only be active if on an other cpu. Since
+ * we have cleared the interval stuff above, it should
+ * clear once we release the spin lock. Of course once
+ * we do that anything could happen, including the
+ * complete melt down of the timer. So return with
+ * a "retry" exit status.
+ */
+ return TIMER_RETRY;
+#else
+ del_timer(&timer->it.real.timer);
+#endif
+ remove_from_abslist(timer);
+
+ return 0;
+}
+
+static inline int timer_delete_hook(struct k_itimer *timer)
+{
+ return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer));
+}
+
+/* Delete a POSIX.1b interval timer. */
+asmlinkage long
+sys_timer_delete(timer_t timer_id)
+{
+ struct k_itimer *timer;
+ long flags;
+
+#ifdef CONFIG_SMP
+ int error;
+retry_delete:
+#endif
+ timer = lock_timer(timer_id, &flags);
+ if (!timer)
+ return -EINVAL;
+
+#ifdef CONFIG_SMP
+ error = timer_delete_hook(timer);
+
+ if (error == TIMER_RETRY) {
+ unlock_timer(timer, flags);
+ goto retry_delete;
+ }
+#else
+ timer_delete_hook(timer);
+#endif
+ spin_lock(&current->sighand->siglock);
+ list_del(&timer->list);
+ spin_unlock(&current->sighand->siglock);
+ /*
+ * This keeps any tasks waiting on the spin lock from thinking
+ * they got something (see the lock code above).
+ */
+ if (timer->it_process) {
+ if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+ put_task_struct(timer->it_process);
+ timer->it_process = NULL;
+ }
+ unlock_timer(timer, flags);
+ release_posix_timer(timer, IT_ID_SET);
+ return 0;
+}
+/*
+ * return timer owned by the process, used by exit_itimers
+ */
+static inline void itimer_delete(struct k_itimer *timer)
+{
+ unsigned long flags;
+
+#ifdef CONFIG_SMP
+ int error;
+retry_delete:
+#endif
+ spin_lock_irqsave(&timer->it_lock, flags);
+
+#ifdef CONFIG_SMP
+ error = timer_delete_hook(timer);
+
+ if (error == TIMER_RETRY) {
+ unlock_timer(timer, flags);
+ goto retry_delete;
+ }
+#else
+ timer_delete_hook(timer);
+#endif
+ list_del(&timer->list);
+ /*
+ * This keeps any tasks waiting on the spin lock from thinking
+ * they got something (see the lock code above).
+ */
+ if (timer->it_process) {
+ if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+ put_task_struct(timer->it_process);
+ timer->it_process = NULL;
+ }
+ unlock_timer(timer, flags);
+ release_posix_timer(timer, IT_ID_SET);
+}
+
+/*
+ * This is called by __exit_signal, only when there are no more
+ * references to the shared signal_struct.
+ */
+void exit_itimers(struct signal_struct *sig)
+{
+ struct k_itimer *tmr;
+
+ while (!list_empty(&sig->posix_timers)) {
+ tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
+ itimer_delete(tmr);
+ }
+}
+
+/*
+ * And now for the "clock" calls
+ *
+ * These functions are called both from timer functions (with the timer
+ * spin_lock_irq() held and from clock calls with no locking. They must
+ * use the save flags versions of locks.
+ */
+
+/*
+ * We do ticks here to avoid the irq lock ( they take sooo long).
+ * The seqlock is great here. Since we a reader, we don't really care
+ * if we are interrupted since we don't take lock that will stall us or
+ * any other cpu. Voila, no irq lock is needed.
+ *
+ */
+
+static u64 do_posix_clock_monotonic_gettime_parts(
+ struct timespec *tp, struct timespec *mo)
+{
+ u64 jiff;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ getnstimeofday(tp);
+ *mo = wall_to_monotonic;
+ jiff = jiffies_64;
+
+ } while(read_seqretry(&xtime_lock, seq));
+
+ return jiff;
+}
+
+static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp)
+{
+ struct timespec wall_to_mono;
+
+ do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono);
+
+ tp->tv_sec += wall_to_mono.tv_sec;
+ tp->tv_nsec += wall_to_mono.tv_nsec;
+
+ if ((tp->tv_nsec - NSEC_PER_SEC) > 0) {
+ tp->tv_nsec -= NSEC_PER_SEC;
+ tp->tv_sec++;
+ }
+ return 0;
+}
+
+int do_posix_clock_monotonic_gettime(struct timespec *tp)
+{
+ return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp);
+}
+
+int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
+
+int do_posix_clock_notimer_create(struct k_itimer *timer)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create);
+
+int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t)
+{
+#ifndef ENOTSUP
+ return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
+#else /* parisc does define it separately. */
+ return -ENOTSUP;
+#endif
+}
+EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
+
+asmlinkage long
+sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp)
+{
+ struct timespec new_tp;
+
+ if (invalid_clockid(which_clock))
+ return -EINVAL;
+ if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+ return -EFAULT;
+
+ return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp));
+}
+
+asmlinkage long
+sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp)
+{
+ struct timespec kernel_tp;
+ int error;
+
+ if (invalid_clockid(which_clock))
+ return -EINVAL;
+ error = CLOCK_DISPATCH(which_clock, clock_get,
+ (which_clock, &kernel_tp));
+ if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+ error = -EFAULT;
+
+ return error;
+
+}
+
+asmlinkage long
+sys_clock_getres(clockid_t which_clock, struct timespec __user *tp)
+{
+ struct timespec rtn_tp;
+ int error;
+
+ if (invalid_clockid(which_clock))
+ return -EINVAL;
+
+ error = CLOCK_DISPATCH(which_clock, clock_getres,
+ (which_clock, &rtn_tp));
+
+ if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) {
+ error = -EFAULT;
+ }
+
+ return error;
+}
+
+static void nanosleep_wake_up(unsigned long __data)
+{
+ struct task_struct *p = (struct task_struct *) __data;
+
+ wake_up_process(p);
+}
+
+/*
+ * The standard says that an absolute nanosleep call MUST wake up at
+ * the requested time in spite of clock settings. Here is what we do:
+ * For each nanosleep call that needs it (only absolute and not on
+ * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure
+ * into the "nanosleep_abs_list". All we need is the task_struct pointer.
+ * When ever the clock is set we just wake up all those tasks. The rest
+ * is done by the while loop in clock_nanosleep().
+ *
+ * On locking, clock_was_set() is called from update_wall_clock which
+ * holds (or has held for it) a write_lock_irq( xtime_lock) and is
+ * called from the timer bh code. Thus we need the irq save locks.
+ *
+ * Also, on the call from update_wall_clock, that is done as part of a
+ * softirq thing. We don't want to delay the system that much (possibly
+ * long list of timers to fix), so we defer that work to keventd.
+ */
+
+static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue);
+static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL);
+
+static DECLARE_MUTEX(clock_was_set_lock);
+
+void clock_was_set(void)
+{
+ struct k_itimer *timr;
+ struct timespec new_wall_to;
+ LIST_HEAD(cws_list);
+ unsigned long seq;
+
+
+ if (unlikely(in_interrupt())) {
+ schedule_work(&clock_was_set_work);
+ return;
+ }
+ wake_up_all(&nanosleep_abs_wqueue);
+
+ /*
+ * Check if there exist TIMER_ABSTIME timers to correct.
+ *
+ * Notes on locking: This code is run in task context with irq
+ * on. We CAN be interrupted! All other usage of the abs list
+ * lock is under the timer lock which holds the irq lock as
+ * well. We REALLY don't want to scan the whole list with the
+ * interrupt system off, AND we would like a sequence lock on
+ * this code as well. Since we assume that the clock will not
+ * be set often, it seems ok to take and release the irq lock
+ * for each timer. In fact add_timer will do this, so this is
+ * not an issue. So we know when we are done, we will move the
+ * whole list to a new location. Then as we process each entry,
+ * we will move it to the actual list again. This way, when our
+ * copy is empty, we are done. We are not all that concerned
+ * about preemption so we will use a semaphore lock to protect
+ * aginst reentry. This way we will not stall another
+ * processor. It is possible that this may delay some timers
+ * that should have expired, given the new clock, but even this
+ * will be minimal as we will always update to the current time,
+ * even if it was set by a task that is waiting for entry to
+ * this code. Timers that expire too early will be caught by
+ * the expire code and restarted.
+
+ * Absolute timers that repeat are left in the abs list while
+ * waiting for the task to pick up the signal. This means we
+ * may find timers that are not in the "add_timer" list, but are
+ * in the abs list. We do the same thing for these, save
+ * putting them back in the "add_timer" list. (Note, these are
+ * left in the abs list mainly to indicate that they are
+ * ABSOLUTE timers, a fact that is used by the re-arm code, and
+ * for which we have no other flag.)
+
+ */
+
+ down(&clock_was_set_lock);
+ spin_lock_irq(&abs_list.lock);
+ list_splice_init(&abs_list.list, &cws_list);
+ spin_unlock_irq(&abs_list.lock);
+ do {
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ new_wall_to = wall_to_monotonic;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ spin_lock_irq(&abs_list.lock);
+ if (list_empty(&cws_list)) {
+ spin_unlock_irq(&abs_list.lock);
+ break;
+ }
+ timr = list_entry(cws_list.next, struct k_itimer,
+ it.real.abs_timer_entry);
+
+ list_del_init(&timr->it.real.abs_timer_entry);
+ if (add_clockset_delta(timr, &new_wall_to) &&
+ del_timer(&timr->it.real.timer)) /* timer run yet? */
+ add_timer(&timr->it.real.timer);
+ list_add(&timr->it.real.abs_timer_entry, &abs_list.list);
+ spin_unlock_irq(&abs_list.lock);
+ } while (1);
+
+ up(&clock_was_set_lock);
+}
+
+long clock_nanosleep_restart(struct restart_block *restart_block);
+
+asmlinkage long
+sys_clock_nanosleep(clockid_t which_clock, int flags,
+ const struct timespec __user *rqtp,
+ struct timespec __user *rmtp)
+{
+ struct timespec t;
+ struct restart_block *restart_block =
+ &(current_thread_info()->restart_block);
+ int ret;
+
+ if (invalid_clockid(which_clock))
+ return -EINVAL;
+
+ if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+ return -EFAULT;
+
+ if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0)
+ return -EINVAL;
+
+ /*
+ * Do this here as nsleep function does not have the real address.
+ */
+ restart_block->arg1 = (unsigned long)rmtp;
+
+ ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t));
+
+ if ((ret == -ERESTART_RESTARTBLOCK) && rmtp &&
+ copy_to_user(rmtp, &t, sizeof (t)))
+ return -EFAULT;
+ return ret;
+}
+
+
+static int common_nsleep(clockid_t which_clock,
+ int flags, struct timespec *tsave)
+{
+ struct timespec t, dum;
+ struct timer_list new_timer;
+ DECLARE_WAITQUEUE(abs_wqueue, current);
+ u64 rq_time = (u64)0;
+ s64 left;
+ int abs;
+ struct restart_block *restart_block =
+ &current_thread_info()->restart_block;
+
+ abs_wqueue.flags = 0;
+ init_timer(&new_timer);
+ new_timer.expires = 0;
+ new_timer.data = (unsigned long) current;
+ new_timer.function = nanosleep_wake_up;
+ abs = flags & TIMER_ABSTIME;
+
+ if (restart_block->fn == clock_nanosleep_restart) {
+ /*
+ * Interrupted by a non-delivered signal, pick up remaining
+ * time and continue. Remaining time is in arg2 & 3.
+ */
+ restart_block->fn = do_no_restart_syscall;
+
+ rq_time = restart_block->arg3;
+ rq_time = (rq_time << 32) + restart_block->arg2;
+ if (!rq_time)
+ return -EINTR;
+ left = rq_time - get_jiffies_64();
+ if (left <= (s64)0)
+ return 0; /* Already passed */
+ }
+
+ if (abs && (posix_clocks[which_clock].clock_get !=
+ posix_clocks[CLOCK_MONOTONIC].clock_get))
+ add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue);
+
+ do {
+ t = *tsave;
+ if (abs || !rq_time) {
+ adjust_abs_time(&posix_clocks[which_clock], &t, abs,
+ &rq_time, &dum);
+ }
+
+ left = rq_time - get_jiffies_64();
+ if (left >= (s64)MAX_JIFFY_OFFSET)
+ left = (s64)MAX_JIFFY_OFFSET;
+ if (left < (s64)0)
+ break;
+
+ new_timer.expires = jiffies + left;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ add_timer(&new_timer);
+
+ schedule();
+
+ del_timer_sync(&new_timer);
+ left = rq_time - get_jiffies_64();
+ } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING));
+
+ if (abs_wqueue.task_list.next)
+ finish_wait(&nanosleep_abs_wqueue, &abs_wqueue);
+
+ if (left > (s64)0) {
+
+ /*
+ * Always restart abs calls from scratch to pick up any
+ * clock shifting that happened while we are away.
+ */
+ if (abs)
+ return -ERESTARTNOHAND;
+
+ left *= TICK_NSEC;
+ tsave->tv_sec = div_long_long_rem(left,
+ NSEC_PER_SEC,
+ &tsave->tv_nsec);
+ /*
+ * Restart works by saving the time remaing in
+ * arg2 & 3 (it is 64-bits of jiffies). The other
+ * info we need is the clock_id (saved in arg0).
+ * The sys_call interface needs the users
+ * timespec return address which _it_ saves in arg1.
+ * Since we have cast the nanosleep call to a clock_nanosleep
+ * both can be restarted with the same code.
+ */
+ restart_block->fn = clock_nanosleep_restart;
+ restart_block->arg0 = which_clock;
+ /*
+ * Caller sets arg1
+ */
+ restart_block->arg2 = rq_time & 0xffffffffLL;
+ restart_block->arg3 = rq_time >> 32;
+
+ return -ERESTART_RESTARTBLOCK;
+ }
+
+ return 0;
+}
+/*
+ * This will restart clock_nanosleep.
+ */
+long
+clock_nanosleep_restart(struct restart_block *restart_block)
+{
+ struct timespec t;
+ int ret = common_nsleep(restart_block->arg0, 0, &t);
+
+ if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 &&
+ copy_to_user((struct timespec __user *)(restart_block->arg1), &t,
+ sizeof (t)))
+ return -EFAULT;
+ return ret;
+}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
new file mode 100644
index 000000000000..696387ffe49c
--- /dev/null
+++ b/kernel/power/Kconfig
@@ -0,0 +1,74 @@
+config PM
+ bool "Power Management support"
+ ---help---
+ "Power Management" means that parts of your computer are shut
+ off or put into a power conserving "sleep" mode if they are not
+ being used. There are two competing standards for doing this: APM
+ and ACPI. If you want to use either one, say Y here and then also
+ to the requisite support below.
+
+ Power Management is most important for battery powered laptop
+ computers; if you have a laptop, check out the Linux Laptop home
+ page on the WWW at <http://www.linux-on-laptops.com/> or
+ Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
+ and the Battery Powered Linux mini-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>.
+
+ Note that, even if you say N here, Linux on the x86 architecture
+ will issue the hlt instruction if nothing is to be done, thereby
+ sending the processor to sleep and saving power.
+
+config PM_DEBUG
+ bool "Power Management Debug Support"
+ depends on PM
+ ---help---
+ This option enables verbose debugging support in the Power Management
+ code. This is helpful when debugging and reporting various PM bugs,
+ like suspend support.
+
+config SOFTWARE_SUSPEND
+ bool "Software Suspend (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && PM && SWAP
+ ---help---
+ Enable the possibility of suspending the machine.
+ It doesn't need APM.
+ You may suspend your machine by 'swsusp' or 'shutdown -z <time>'
+ (patch for sysvinit needed).
+
+ It creates an image which is saved in your active swap. Upon next
+ boot, pass the 'resume=/dev/swappartition' argument to the kernel to
+ have it detect the saved image, restore memory state from it, and
+ continue to run as before. If you do not want the previous state to
+ be reloaded, then use the 'noresume' kernel argument. However, note
+ that your partitions will be fsck'd and you must re-mkswap your swap
+ partitions. It does not work with swap files.
+
+ Right now you may boot without resuming and then later resume but
+ in meantime you cannot use those swap partitions/files which were
+ involved in suspending. Also in this case there is a risk that buffers
+ on disk won't match with saved ones.
+
+ For more information take a look at <file:Documentation/power/swsusp.txt>.
+
+config PM_STD_PARTITION
+ string "Default resume partition"
+ depends on SOFTWARE_SUSPEND
+ default ""
+ ---help---
+ The default resume partition is the partition that the suspend-
+ to-disk implementation will look for a suspended disk image.
+
+ The partition specified here will be different for almost every user.
+ It should be a valid swap partition (at least for now) that is turned
+ on before suspending.
+
+ The partition specified can be overridden by specifying:
+
+ resume=/dev/<other device>
+
+ which will set the resume partition to the device specified.
+
+ Note there is currently not a way to specify which device to save the
+ suspended image to. It will simply pick the first available swap
+ device.
+
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
new file mode 100644
index 000000000000..fbdc634135a7
--- /dev/null
+++ b/kernel/power/Makefile
@@ -0,0 +1,11 @@
+
+ifeq ($(CONFIG_PM_DEBUG),y)
+EXTRA_CFLAGS += -DDEBUG
+endif
+
+swsusp-smp-$(CONFIG_SMP) += smp.o
+
+obj-y := main.o process.o console.o pm.o
+obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o
+
+obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
new file mode 100644
index 000000000000..7ff375e7c95f
--- /dev/null
+++ b/kernel/power/console.c
@@ -0,0 +1,58 @@
+/*
+ * drivers/power/process.c - Functions for saving/restoring console.
+ *
+ * Originally from swsusp.
+ */
+
+#include <linux/vt_kern.h>
+#include <linux/kbd_kern.h>
+#include <linux/console.h>
+#include "power.h"
+
+static int new_loglevel = 10;
+static int orig_loglevel;
+#ifdef SUSPEND_CONSOLE
+static int orig_fgconsole, orig_kmsg;
+#endif
+
+int pm_prepare_console(void)
+{
+ orig_loglevel = console_loglevel;
+ console_loglevel = new_loglevel;
+
+#ifdef SUSPEND_CONSOLE
+ acquire_console_sem();
+
+ orig_fgconsole = fg_console;
+
+ if (vc_allocate(SUSPEND_CONSOLE)) {
+ /* we can't have a free VC for now. Too bad,
+ * we don't want to mess the screen for now. */
+ release_console_sem();
+ return 1;
+ }
+
+ set_console(SUSPEND_CONSOLE);
+ release_console_sem();
+
+ if (vt_waitactive(SUSPEND_CONSOLE)) {
+ pr_debug("Suspend: Can't switch VCs.");
+ return 1;
+ }
+ orig_kmsg = kmsg_redirect;
+ kmsg_redirect = SUSPEND_CONSOLE;
+#endif
+ return 0;
+}
+
+void pm_restore_console(void)
+{
+ console_loglevel = orig_loglevel;
+#ifdef SUSPEND_CONSOLE
+ acquire_console_sem();
+ set_console(orig_fgconsole);
+ release_console_sem();
+ kmsg_redirect = orig_kmsg;
+#endif
+ return;
+}
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
new file mode 100644
index 000000000000..02b6764034dc
--- /dev/null
+++ b/kernel/power/disk.c
@@ -0,0 +1,431 @@
+/*
+ * kernel/power/disk.c - Suspend-to-disk support.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include "power.h"
+
+
+extern suspend_disk_method_t pm_disk_mode;
+extern struct pm_ops * pm_ops;
+
+extern int swsusp_suspend(void);
+extern int swsusp_write(void);
+extern int swsusp_check(void);
+extern int swsusp_read(void);
+extern void swsusp_close(void);
+extern int swsusp_resume(void);
+extern int swsusp_free(void);
+
+
+static int noresume = 0;
+char resume_file[256] = CONFIG_PM_STD_PARTITION;
+dev_t swsusp_resume_device;
+
+/**
+ * power_down - Shut machine down for hibernate.
+ * @mode: Suspend-to-disk mode
+ *
+ * Use the platform driver, if configured so, and return gracefully if it
+ * fails.
+ * Otherwise, try to power off and reboot. If they fail, halt the machine,
+ * there ain't no turning back.
+ */
+
+static void power_down(suspend_disk_method_t mode)
+{
+ unsigned long flags;
+ int error = 0;
+
+ local_irq_save(flags);
+ switch(mode) {
+ case PM_DISK_PLATFORM:
+ device_shutdown();
+ error = pm_ops->enter(PM_SUSPEND_DISK);
+ break;
+ case PM_DISK_SHUTDOWN:
+ printk("Powering off system\n");
+ device_shutdown();
+ machine_power_off();
+ break;
+ case PM_DISK_REBOOT:
+ device_shutdown();
+ machine_restart(NULL);
+ break;
+ }
+ machine_halt();
+ /* Valid image is on the disk, if we continue we risk serious data corruption
+ after resume. */
+ printk(KERN_CRIT "Please power me down manually\n");
+ while(1);
+}
+
+
+static int in_suspend __nosavedata = 0;
+
+
+/**
+ * free_some_memory - Try to free as much memory as possible
+ *
+ * ... but do not OOM-kill anyone
+ *
+ * Notice: all userland should be stopped at this point, or
+ * livelock is possible.
+ */
+
+static void free_some_memory(void)
+{
+ unsigned int i = 0;
+ unsigned int tmp;
+ unsigned long pages = 0;
+ char *p = "-\\|/";
+
+ printk("Freeing memory... ");
+ while ((tmp = shrink_all_memory(10000))) {
+ pages += tmp;
+ printk("\b%c", p[i]);
+ i++;
+ if (i > 3)
+ i = 0;
+ }
+ printk("\bdone (%li pages freed)\n", pages);
+}
+
+
+static inline void platform_finish(void)
+{
+ if (pm_disk_mode == PM_DISK_PLATFORM) {
+ if (pm_ops && pm_ops->finish)
+ pm_ops->finish(PM_SUSPEND_DISK);
+ }
+}
+
+static void finish(void)
+{
+ device_resume();
+ platform_finish();
+ enable_nonboot_cpus();
+ thaw_processes();
+ pm_restore_console();
+}
+
+
+static int prepare_processes(void)
+{
+ int error;
+
+ pm_prepare_console();
+
+ sys_sync();
+
+ if (freeze_processes()) {
+ error = -EBUSY;
+ return error;
+ }
+
+ if (pm_disk_mode == PM_DISK_PLATFORM) {
+ if (pm_ops && pm_ops->prepare) {
+ if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
+ return error;
+ }
+ }
+
+ /* Free memory before shutting down devices. */
+ free_some_memory();
+
+ return 0;
+}
+
+static void unprepare_processes(void)
+{
+ enable_nonboot_cpus();
+ thaw_processes();
+ pm_restore_console();
+}
+
+static int prepare_devices(void)
+{
+ int error;
+
+ disable_nonboot_cpus();
+ if ((error = device_suspend(PMSG_FREEZE))) {
+ printk("Some devices failed to suspend\n");
+ platform_finish();
+ enable_nonboot_cpus();
+ return error;
+ }
+
+ return 0;
+}
+
+/**
+ * pm_suspend_disk - The granpappy of power management.
+ *
+ * If we're going through the firmware, then get it over with quickly.
+ *
+ * If not, then call swsusp to do its thing, then figure out how
+ * to power down the system.
+ */
+
+int pm_suspend_disk(void)
+{
+ int error;
+
+ error = prepare_processes();
+ if (!error) {
+ error = prepare_devices();
+ }
+
+ if (error) {
+ unprepare_processes();
+ return error;
+ }
+
+ pr_debug("PM: Attempting to suspend to disk.\n");
+ if (pm_disk_mode == PM_DISK_FIRMWARE)
+ return pm_ops->enter(PM_SUSPEND_DISK);
+
+ pr_debug("PM: snapshotting memory.\n");
+ in_suspend = 1;
+ if ((error = swsusp_suspend()))
+ goto Done;
+
+ if (in_suspend) {
+ pr_debug("PM: writing image.\n");
+ error = swsusp_write();
+ if (!error)
+ power_down(pm_disk_mode);
+ } else
+ pr_debug("PM: Image restored successfully.\n");
+ swsusp_free();
+ Done:
+ finish();
+ return error;
+}
+
+
+/**
+ * software_resume - Resume from a saved image.
+ *
+ * Called as a late_initcall (so all devices are discovered and
+ * initialized), we call swsusp to see if we have a saved image or not.
+ * If so, we quiesce devices, the restore the saved image. We will
+ * return above (in pm_suspend_disk() ) if everything goes well.
+ * Otherwise, we fail gracefully and return to the normally
+ * scheduled program.
+ *
+ */
+
+static int software_resume(void)
+{
+ int error;
+
+ if (noresume) {
+ /**
+ * FIXME: If noresume is specified, we need to find the partition
+ * and reset it back to normal swap space.
+ */
+ return 0;
+ }
+
+ pr_debug("PM: Checking swsusp image.\n");
+
+ if ((error = swsusp_check()))
+ goto Done;
+
+ pr_debug("PM: Preparing processes for restore.\n");
+
+ if ((error = prepare_processes())) {
+ swsusp_close();
+ goto Cleanup;
+ }
+
+ pr_debug("PM: Reading swsusp image.\n");
+
+ if ((error = swsusp_read()))
+ goto Cleanup;
+
+ pr_debug("PM: Preparing devices for restore.\n");
+
+ if ((error = prepare_devices()))
+ goto Free;
+
+ mb();
+
+ pr_debug("PM: Restoring saved image.\n");
+ swsusp_resume();
+ pr_debug("PM: Restore failed, recovering.n");
+ finish();
+ Free:
+ swsusp_free();
+ Cleanup:
+ unprepare_processes();
+ Done:
+ pr_debug("PM: Resume from disk failed.\n");
+ return 0;
+}
+
+late_initcall(software_resume);
+
+
+static char * pm_disk_modes[] = {
+ [PM_DISK_FIRMWARE] = "firmware",
+ [PM_DISK_PLATFORM] = "platform",
+ [PM_DISK_SHUTDOWN] = "shutdown",
+ [PM_DISK_REBOOT] = "reboot",
+};
+
+/**
+ * disk - Control suspend-to-disk mode
+ *
+ * Suspend-to-disk can be handled in several ways. The greatest
+ * distinction is who writes memory to disk - the firmware or the OS.
+ * If the firmware does it, we assume that it also handles suspending
+ * the system.
+ * If the OS does it, then we have three options for putting the system
+ * to sleep - using the platform driver (e.g. ACPI or other PM registers),
+ * powering off the system or rebooting the system (for testing).
+ *
+ * The system will support either 'firmware' or 'platform', and that is
+ * known a priori (and encoded in pm_ops). But, the user may choose
+ * 'shutdown' or 'reboot' as alternatives.
+ *
+ * show() will display what the mode is currently set to.
+ * store() will accept one of
+ *
+ * 'firmware'
+ * 'platform'
+ * 'shutdown'
+ * 'reboot'
+ *
+ * It will only change to 'firmware' or 'platform' if the system
+ * supports it (as determined from pm_ops->pm_disk_mode).
+ */
+
+static ssize_t disk_show(struct subsystem * subsys, char * buf)
+{
+ return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]);
+}
+
+
+static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
+{
+ int error = 0;
+ int i;
+ int len;
+ char *p;
+ suspend_disk_method_t mode = 0;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
+ down(&pm_sem);
+ for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
+ if (!strncmp(buf, pm_disk_modes[i], len)) {
+ mode = i;
+ break;
+ }
+ }
+ if (mode) {
+ if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT)
+ pm_disk_mode = mode;
+ else {
+ if (pm_ops && pm_ops->enter &&
+ (mode == pm_ops->pm_disk_mode))
+ pm_disk_mode = mode;
+ else
+ error = -EINVAL;
+ }
+ } else
+ error = -EINVAL;
+
+ pr_debug("PM: suspend-to-disk mode set to '%s'\n",
+ pm_disk_modes[mode]);
+ up(&pm_sem);
+ return error ? error : n;
+}
+
+power_attr(disk);
+
+static ssize_t resume_show(struct subsystem * subsys, char *buf)
+{
+ return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
+ MINOR(swsusp_resume_device));
+}
+
+static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+ int len;
+ char *p;
+ unsigned int maj, min;
+ int error = -EINVAL;
+ dev_t res;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
+ if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
+ res = MKDEV(maj,min);
+ if (maj == MAJOR(res) && min == MINOR(res)) {
+ swsusp_resume_device = res;
+ printk("Attempting manual resume\n");
+ noresume = 0;
+ software_resume();
+ }
+ }
+
+ return error >= 0 ? n : error;
+}
+
+power_attr(resume);
+
+static struct attribute * g[] = {
+ &disk_attr.attr,
+ &resume_attr.attr,
+ NULL,
+};
+
+
+static struct attribute_group attr_group = {
+ .attrs = g,
+};
+
+
+static int __init pm_disk_init(void)
+{
+ return sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
+}
+
+core_initcall(pm_disk_init);
+
+
+static int __init resume_setup(char *str)
+{
+ if (noresume)
+ return 1;
+
+ strncpy( resume_file, str, 255 );
+ return 1;
+}
+
+static int __init noresume_setup(char *str)
+{
+ noresume = 1;
+ return 1;
+}
+
+__setup("noresume", noresume_setup);
+__setup("resume=", resume_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
new file mode 100644
index 000000000000..7960ddf04a57
--- /dev/null
+++ b/kernel/power/main.c
@@ -0,0 +1,269 @@
+/*
+ * kernel/power/main.c - PM subsystem core functionality.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ *
+ * This file is released under the GPLv2
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+
+
+#include "power.h"
+
+DECLARE_MUTEX(pm_sem);
+
+struct pm_ops * pm_ops = NULL;
+suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
+
+/**
+ * pm_set_ops - Set the global power method table.
+ * @ops: Pointer to ops structure.
+ */
+
+void pm_set_ops(struct pm_ops * ops)
+{
+ down(&pm_sem);
+ pm_ops = ops;
+ up(&pm_sem);
+}
+
+
+/**
+ * suspend_prepare - Do prep work before entering low-power state.
+ * @state: State we're entering.
+ *
+ * This is common code that is called for each state that we're
+ * entering. Allocate a console, stop all processes, then make sure
+ * the platform can enter the requested state.
+ */
+
+static int suspend_prepare(suspend_state_t state)
+{
+ int error = 0;
+
+ if (!pm_ops || !pm_ops->enter)
+ return -EPERM;
+
+ pm_prepare_console();
+
+ if (freeze_processes()) {
+ error = -EAGAIN;
+ goto Thaw;
+ }
+
+ if (pm_ops->prepare) {
+ if ((error = pm_ops->prepare(state)))
+ goto Thaw;
+ }
+
+ if ((error = device_suspend(PMSG_SUSPEND))) {
+ printk(KERN_ERR "Some devices failed to suspend\n");
+ goto Finish;
+ }
+ return 0;
+ Finish:
+ if (pm_ops->finish)
+ pm_ops->finish(state);
+ Thaw:
+ thaw_processes();
+ pm_restore_console();
+ return error;
+}
+
+
+static int suspend_enter(suspend_state_t state)
+{
+ int error = 0;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if ((error = device_power_down(PMSG_SUSPEND))) {
+ printk(KERN_ERR "Some devices failed to power down\n");
+ goto Done;
+ }
+ error = pm_ops->enter(state);
+ device_power_up();
+ Done:
+ local_irq_restore(flags);
+ return error;
+}
+
+
+/**
+ * suspend_finish - Do final work before exiting suspend sequence.
+ * @state: State we're coming out of.
+ *
+ * Call platform code to clean up, restart processes, and free the
+ * console that we've allocated. This is not called for suspend-to-disk.
+ */
+
+static void suspend_finish(suspend_state_t state)
+{
+ device_resume();
+ if (pm_ops && pm_ops->finish)
+ pm_ops->finish(state);
+ thaw_processes();
+ pm_restore_console();
+}
+
+
+
+
+static char * pm_states[] = {
+ [PM_SUSPEND_STANDBY] = "standby",
+ [PM_SUSPEND_MEM] = "mem",
+ [PM_SUSPEND_DISK] = "disk",
+ NULL,
+};
+
+
+/**
+ * enter_state - Do common work of entering low-power state.
+ * @state: pm_state structure for state we're entering.
+ *
+ * Make sure we're the only ones trying to enter a sleep state. Fail
+ * if someone has beat us to it, since we don't want anything weird to
+ * happen when we wake up.
+ * Then, do the setup for suspend, enter the state, and cleaup (after
+ * we've woken up).
+ */
+
+static int enter_state(suspend_state_t state)
+{
+ int error;
+
+ if (down_trylock(&pm_sem))
+ return -EBUSY;
+
+ if (state == PM_SUSPEND_DISK) {
+ error = pm_suspend_disk();
+ goto Unlock;
+ }
+
+ /* Suspend is hard to get right on SMP. */
+ if (num_online_cpus() != 1) {
+ error = -EPERM;
+ goto Unlock;
+ }
+
+ pr_debug("PM: Preparing system for suspend\n");
+ if ((error = suspend_prepare(state)))
+ goto Unlock;
+
+ pr_debug("PM: Entering state.\n");
+ error = suspend_enter(state);
+
+ pr_debug("PM: Finishing up.\n");
+ suspend_finish(state);
+ Unlock:
+ up(&pm_sem);
+ return error;
+}
+
+/*
+ * This is main interface to the outside world. It needs to be
+ * called from process context.
+ */
+int software_suspend(void)
+{
+ return enter_state(PM_SUSPEND_DISK);
+}
+
+
+/**
+ * pm_suspend - Externally visible function for suspending system.
+ * @state: Enumarted value of state to enter.
+ *
+ * Determine whether or not value is within range, get state
+ * structure, and enter (above).
+ */
+
+int pm_suspend(suspend_state_t state)
+{
+ if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX)
+ return enter_state(state);
+ return -EINVAL;
+}
+
+
+
+decl_subsys(power,NULL,NULL);
+
+
+/**
+ * state - control system power state.
+ *
+ * show() returns what states are supported, which is hard-coded to
+ * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
+ * 'disk' (Suspend-to-Disk).
+ *
+ * store() accepts one of those strings, translates it into the
+ * proper enumerated value, and initiates a suspend transition.
+ */
+
+static ssize_t state_show(struct subsystem * subsys, char * buf)
+{
+ int i;
+ char * s = buf;
+
+ for (i = 0; i < PM_SUSPEND_MAX; i++) {
+ if (pm_states[i])
+ s += sprintf(s,"%s ",pm_states[i]);
+ }
+ s += sprintf(s,"\n");
+ return (s - buf);
+}
+
+static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+ suspend_state_t state = PM_SUSPEND_STANDBY;
+ char ** s;
+ char *p;
+ int error;
+ int len;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
+ for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
+ if (*s && !strncmp(buf, *s, len))
+ break;
+ }
+ if (*s)
+ error = enter_state(state);
+ else
+ error = -EINVAL;
+ return error ? error : n;
+}
+
+power_attr(state);
+
+static struct attribute * g[] = {
+ &state_attr.attr,
+ NULL,
+};
+
+static struct attribute_group attr_group = {
+ .attrs = g,
+};
+
+
+static int __init pm_init(void)
+{
+ int error = subsystem_register(&power_subsys);
+ if (!error)
+ error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
+ return error;
+}
+
+core_initcall(pm_init);
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
new file mode 100644
index 000000000000..61deda04e39e
--- /dev/null
+++ b/kernel/power/pm.c
@@ -0,0 +1,265 @@
+/*
+ * pm.c - Power management interface
+ *
+ * Copyright (C) 2000 Andrew Henroid
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/pm.h>
+#include <linux/interrupt.h>
+
+int pm_active;
+
+/*
+ * Locking notes:
+ * pm_devs_lock can be a semaphore providing pm ops are not called
+ * from an interrupt handler (already a bad idea so no change here). Each
+ * change must be protected so that an unlink of an entry doesn't clash
+ * with a pm send - which is permitted to sleep in the current architecture
+ *
+ * Module unloads clashing with pm events now work out safely, the module
+ * unload path will block until the event has been sent. It may well block
+ * until a resume but that will be fine.
+ */
+
+static DECLARE_MUTEX(pm_devs_lock);
+static LIST_HEAD(pm_devs);
+
+/**
+ * pm_register - register a device with power management
+ * @type: device type
+ * @id: device ID
+ * @callback: callback function
+ *
+ * Add a device to the list of devices that wish to be notified about
+ * power management events. A &pm_dev structure is returned on success,
+ * on failure the return is %NULL.
+ *
+ * The callback function will be called in process context and
+ * it may sleep.
+ */
+
+struct pm_dev *pm_register(pm_dev_t type,
+ unsigned long id,
+ pm_callback callback)
+{
+ struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL);
+ if (dev) {
+ memset(dev, 0, sizeof(*dev));
+ dev->type = type;
+ dev->id = id;
+ dev->callback = callback;
+
+ down(&pm_devs_lock);
+ list_add(&dev->entry, &pm_devs);
+ up(&pm_devs_lock);
+ }
+ return dev;
+}
+
+/**
+ * pm_unregister - unregister a device with power management
+ * @dev: device to unregister
+ *
+ * Remove a device from the power management notification lists. The
+ * dev passed must be a handle previously returned by pm_register.
+ */
+
+void pm_unregister(struct pm_dev *dev)
+{
+ if (dev) {
+ down(&pm_devs_lock);
+ list_del(&dev->entry);
+ up(&pm_devs_lock);
+
+ kfree(dev);
+ }
+}
+
+static void __pm_unregister(struct pm_dev *dev)
+{
+ if (dev) {
+ list_del(&dev->entry);
+ kfree(dev);
+ }
+}
+
+/**
+ * pm_unregister_all - unregister all devices with matching callback
+ * @callback: callback function pointer
+ *
+ * Unregister every device that would call the callback passed. This
+ * is primarily meant as a helper function for loadable modules. It
+ * enables a module to give up all its managed devices without keeping
+ * its own private list.
+ */
+
+void pm_unregister_all(pm_callback callback)
+{
+ struct list_head *entry;
+
+ if (!callback)
+ return;
+
+ down(&pm_devs_lock);
+ entry = pm_devs.next;
+ while (entry != &pm_devs) {
+ struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+ entry = entry->next;
+ if (dev->callback == callback)
+ __pm_unregister(dev);
+ }
+ up(&pm_devs_lock);
+}
+
+/**
+ * pm_send - send request to a single device
+ * @dev: device to send to
+ * @rqst: power management request
+ * @data: data for the callback
+ *
+ * Issue a power management request to a given device. The
+ * %PM_SUSPEND and %PM_RESUME events are handled specially. The
+ * data field must hold the intended next state. No call is made
+ * if the state matches.
+ *
+ * BUGS: what stops two power management requests occurring in parallel
+ * and conflicting.
+ *
+ * WARNING: Calling pm_send directly is not generally recommended, in
+ * particular there is no locking against the pm_dev going away. The
+ * caller must maintain all needed locking or have 'inside knowledge'
+ * on the safety. Also remember that this function is not locked against
+ * pm_unregister. This means that you must handle SMP races on callback
+ * execution and unload yourself.
+ */
+
+static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
+{
+ int status = 0;
+ unsigned long prev_state, next_state;
+
+ if (in_interrupt())
+ BUG();
+
+ switch (rqst) {
+ case PM_SUSPEND:
+ case PM_RESUME:
+ prev_state = dev->state;
+ next_state = (unsigned long) data;
+ if (prev_state != next_state) {
+ if (dev->callback)
+ status = (*dev->callback)(dev, rqst, data);
+ if (!status) {
+ dev->state = next_state;
+ dev->prev_state = prev_state;
+ }
+ }
+ else {
+ dev->prev_state = prev_state;
+ }
+ break;
+ default:
+ if (dev->callback)
+ status = (*dev->callback)(dev, rqst, data);
+ break;
+ }
+ return status;
+}
+
+/*
+ * Undo incomplete request
+ */
+static void pm_undo_all(struct pm_dev *last)
+{
+ struct list_head *entry = last->entry.prev;
+ while (entry != &pm_devs) {
+ struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+ if (dev->state != dev->prev_state) {
+ /* previous state was zero (running) resume or
+ * previous state was non-zero (suspended) suspend
+ */
+ pm_request_t undo = (dev->prev_state
+ ? PM_SUSPEND:PM_RESUME);
+ pm_send(dev, undo, (void*) dev->prev_state);
+ }
+ entry = entry->prev;
+ }
+}
+
+/**
+ * pm_send_all - send request to all managed devices
+ * @rqst: power management request
+ * @data: data for the callback
+ *
+ * Issue a power management request to a all devices. The
+ * %PM_SUSPEND events are handled specially. Any device is
+ * permitted to fail a suspend by returning a non zero (error)
+ * value from its callback function. If any device vetoes a
+ * suspend request then all other devices that have suspended
+ * during the processing of this request are restored to their
+ * previous state.
+ *
+ * WARNING: This function takes the pm_devs_lock. The lock is not dropped until
+ * the callbacks have completed. This prevents races against pm locking
+ * functions, races against module unload pm_unregister code. It does
+ * mean however that you must not issue pm_ functions within the callback
+ * or you will deadlock and users will hate you.
+ *
+ * Zero is returned on success. If a suspend fails then the status
+ * from the device that vetoes the suspend is returned.
+ *
+ * BUGS: what stops two power management requests occurring in parallel
+ * and conflicting.
+ */
+
+int pm_send_all(pm_request_t rqst, void *data)
+{
+ struct list_head *entry;
+
+ down(&pm_devs_lock);
+ entry = pm_devs.next;
+ while (entry != &pm_devs) {
+ struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+ if (dev->callback) {
+ int status = pm_send(dev, rqst, data);
+ if (status) {
+ /* return devices to previous state on
+ * failed suspend request
+ */
+ if (rqst == PM_SUSPEND)
+ pm_undo_all(dev);
+ up(&pm_devs_lock);
+ return status;
+ }
+ }
+ entry = entry->next;
+ }
+ up(&pm_devs_lock);
+ return 0;
+}
+
+EXPORT_SYMBOL(pm_register);
+EXPORT_SYMBOL(pm_unregister);
+EXPORT_SYMBOL(pm_unregister_all);
+EXPORT_SYMBOL(pm_send_all);
+EXPORT_SYMBOL(pm_active);
+
+
diff --git a/kernel/power/power.h b/kernel/power/power.h
new file mode 100644
index 000000000000..cd6a3493cc0d
--- /dev/null
+++ b/kernel/power/power.h
@@ -0,0 +1,52 @@
+#include <linux/suspend.h>
+#include <linux/utsname.h>
+
+/* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but
+ we probably do not take enough locks for switching consoles, etc,
+ so bad things might happen.
+*/
+#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
+#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
+#endif
+
+
+struct swsusp_info {
+ struct new_utsname uts;
+ u32 version_code;
+ unsigned long num_physpages;
+ int cpus;
+ unsigned long image_pages;
+ unsigned long pagedir_pages;
+ suspend_pagedir_t * suspend_pagedir;
+ swp_entry_t pagedir[768];
+} __attribute__((aligned(PAGE_SIZE)));
+
+
+
+#ifdef CONFIG_SOFTWARE_SUSPEND
+extern int pm_suspend_disk(void);
+
+#else
+static inline int pm_suspend_disk(void)
+{
+ return -EPERM;
+}
+#endif
+extern struct semaphore pm_sem;
+#define power_attr(_name) \
+static struct subsys_attribute _name##_attr = { \
+ .attr = { \
+ .name = __stringify(_name), \
+ .mode = 0644, \
+ }, \
+ .show = _name##_show, \
+ .store = _name##_store, \
+}
+
+extern struct subsystem power_subsys;
+
+extern int freeze_processes(void);
+extern void thaw_processes(void);
+
+extern int pm_prepare_console(void);
+extern void pm_restore_console(void);
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
new file mode 100644
index 000000000000..715081b2d829
--- /dev/null
+++ b/kernel/power/poweroff.c
@@ -0,0 +1,45 @@
+/*
+ * poweroff.c - sysrq handler to gracefully power down machine.
+ *
+ * This file is released under the GPL v2
+ */
+
+#include <linux/kernel.h>
+#include <linux/sysrq.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <linux/workqueue.h>
+
+/*
+ * When the user hits Sys-Rq o to power down the machine this is the
+ * callback we use.
+ */
+
+static void do_poweroff(void *dummy)
+{
+ if (pm_power_off)
+ pm_power_off();
+}
+
+static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
+
+static void handle_poweroff(int key, struct pt_regs *pt_regs,
+ struct tty_struct *tty)
+{
+ schedule_work(&poweroff_work);
+}
+
+static struct sysrq_key_op sysrq_poweroff_op = {
+ .handler = handle_poweroff,
+ .help_msg = "powerOff",
+ .action_msg = "Power Off",
+ .enable_mask = SYSRQ_ENABLE_BOOT,
+};
+
+static int pm_sysrq_init(void)
+{
+ register_sysrq_key('o', &sysrq_poweroff_op);
+ return 0;
+}
+
+subsys_initcall(pm_sysrq_init);
diff --git a/kernel/power/process.c b/kernel/power/process.c
new file mode 100644
index 000000000000..78d92dc6a1ed
--- /dev/null
+++ b/kernel/power/process.c
@@ -0,0 +1,121 @@
+/*
+ * drivers/power/process.c - Functions for starting/stopping processes on
+ * suspend transitions.
+ *
+ * Originally from swsusp.
+ */
+
+
+#undef DEBUG
+
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/suspend.h>
+#include <linux/module.h>
+
+/*
+ * Timeout for stopping processes
+ */
+#define TIMEOUT (6 * HZ)
+
+
+static inline int freezeable(struct task_struct * p)
+{
+ if ((p == current) ||
+ (p->flags & PF_NOFREEZE) ||
+ (p->exit_state == EXIT_ZOMBIE) ||
+ (p->exit_state == EXIT_DEAD) ||
+ (p->state == TASK_STOPPED) ||
+ (p->state == TASK_TRACED))
+ return 0;
+ return 1;
+}
+
+/* Refrigerator is place where frozen processes are stored :-). */
+void refrigerator(unsigned long flag)
+{
+ /* Hmm, should we be allowed to suspend when there are realtime
+ processes around? */
+ long save;
+ save = current->state;
+ current->state = TASK_UNINTERRUPTIBLE;
+ pr_debug("%s entered refrigerator\n", current->comm);
+ printk("=");
+ current->flags &= ~PF_FREEZE;
+
+ spin_lock_irq(&current->sighand->siglock);
+ recalc_sigpending(); /* We sent fake signal, clean it up */
+ spin_unlock_irq(&current->sighand->siglock);
+
+ current->flags |= PF_FROZEN;
+ while (current->flags & PF_FROZEN)
+ schedule();
+ pr_debug("%s left refrigerator\n", current->comm);
+ current->state = save;
+}
+
+/* 0 = success, else # of processes that we failed to stop */
+int freeze_processes(void)
+{
+ int todo;
+ unsigned long start_time;
+ struct task_struct *g, *p;
+
+ printk( "Stopping tasks: " );
+ start_time = jiffies;
+ do {
+ todo = 0;
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ unsigned long flags;
+ if (!freezeable(p))
+ continue;
+ if ((p->flags & PF_FROZEN) ||
+ (p->state == TASK_TRACED) ||
+ (p->state == TASK_STOPPED))
+ continue;
+
+ /* FIXME: smp problem here: we may not access other process' flags
+ without locking */
+ p->flags |= PF_FREEZE;
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ signal_wake_up(p, 0);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ todo++;
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+ yield(); /* Yield is okay here */
+ if (time_after(jiffies, start_time + TIMEOUT)) {
+ printk( "\n" );
+ printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
+ return todo;
+ }
+ } while(todo);
+
+ printk( "|\n" );
+ BUG_ON(in_atomic());
+ return 0;
+}
+
+void thaw_processes(void)
+{
+ struct task_struct *g, *p;
+
+ printk( "Restarting tasks..." );
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ if (!freezeable(p))
+ continue;
+ if (p->flags & PF_FROZEN) {
+ p->flags &= ~PF_FROZEN;
+ wake_up_process(p);
+ } else
+ printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
+ } while_each_thread(g, p);
+
+ read_unlock(&tasklist_lock);
+ schedule();
+ printk( " done\n" );
+}
+
+EXPORT_SYMBOL(refrigerator);
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
new file mode 100644
index 000000000000..7fa7f6e2b7fb
--- /dev/null
+++ b/kernel/power/smp.c
@@ -0,0 +1,85 @@
+/*
+ * drivers/power/smp.c - Functions for stopping other CPUs.
+ *
+ * Copyright 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#undef DEBUG
+
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <asm/atomic.h>
+#include <asm/tlbflush.h>
+
+static atomic_t cpu_counter, freeze;
+
+
+static void smp_pause(void * data)
+{
+ struct saved_context ctxt;
+ __save_processor_state(&ctxt);
+ printk("Sleeping in:\n");
+ dump_stack();
+ atomic_inc(&cpu_counter);
+ while (atomic_read(&freeze)) {
+ /* FIXME: restore takes place at random piece inside this.
+ This should probably be written in assembly, and
+ preserve general-purpose registers, too
+
+ What about stack? We may need to move to new stack here.
+
+ This should better be ran with interrupts disabled.
+ */
+ cpu_relax();
+ barrier();
+ }
+ atomic_dec(&cpu_counter);
+ __restore_processor_state(&ctxt);
+}
+
+static cpumask_t oldmask;
+
+void disable_nonboot_cpus(void)
+{
+ printk("Freezing CPUs (at %d)", smp_processor_id());
+ oldmask = current->cpus_allowed;
+ set_cpus_allowed(current, cpumask_of_cpu(0));
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(HZ);
+ printk("...");
+ BUG_ON(smp_processor_id() != 0);
+
+ /* FIXME: for this to work, all the CPUs must be running
+ * "idle" thread (or we deadlock). Is that guaranteed? */
+
+ atomic_set(&cpu_counter, 0);
+ atomic_set(&freeze, 1);
+ smp_call_function(smp_pause, NULL, 0, 0);
+ while (atomic_read(&cpu_c