summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks4
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/audit.c10
-rw-r--r--kernel/auditfilter.c23
-rw-r--r--kernel/auditsc.c49
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/syscall.c25
-rw-r--r--kernel/cgroup.c2
-rw-r--r--kernel/cpu.c56
-rw-r--r--kernel/debug/debug_core.c52
-rw-r--r--kernel/debug/kdb/kdb_bp.c37
-rw-r--r--kernel/debug/kdb/kdb_debugger.c4
-rw-r--r--kernel/debug/kdb/kdb_main.c269
-rw-r--r--kernel/debug/kdb/kdb_private.h3
-rw-r--r--kernel/events/core.c500
-rw-r--r--kernel/events/ring_buffer.c3
-rw-r--r--kernel/exit.c12
-rw-r--r--kernel/fork.c8
-rw-r--r--kernel/futex.c6
-rw-r--r--kernel/irq/internals.h4
-rw-r--r--kernel/irq/irqdesc.c52
-rw-r--r--kernel/irq/proc.c22
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/livepatch/Kconfig18
-rw-r--r--kernel/livepatch/Makefile3
-rw-r--r--kernel/livepatch/core.c1015
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/mcs_spinlock.h16
-rw-r--r--kernel/locking/mutex-debug.c2
-rw-r--r--kernel/locking/mutex.c62
-rw-r--r--kernel/locking/osq_lock.c (renamed from kernel/locking/mcs_spinlock.c)9
-rw-r--r--kernel/locking/rtmutex.c7
-rw-r--r--kernel/locking/rwsem-spinlock.c2
-rw-r--r--kernel/locking/rwsem-xadd.c3
-rw-r--r--kernel/locking/spinlock.c8
-rw-r--r--kernel/module.c251
-rw-r--r--kernel/notifier.c3
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/params.c100
-rw-r--r--kernel/power/Kconfig17
-rw-r--r--kernel/power/qos.c91
-rw-r--r--kernel/power/snapshot.c11
-rw-r--r--kernel/range.c10
-rw-r--r--kernel/rcu/Makefile3
-rw-r--r--kernel/rcu/rcu.h6
-rw-r--r--kernel/rcu/rcutorture.c66
-rw-r--r--kernel/rcu/srcu.c2
-rw-r--r--kernel/rcu/tiny.c113
-rw-r--r--kernel/rcu/tiny_plugin.h9
-rw-r--r--kernel/rcu/tree.c355
-rw-r--r--kernel/rcu/tree.h62
-rw-r--r--kernel/rcu/tree_plugin.h271
-rw-r--r--kernel/rcu/tree_trace.c8
-rw-r--r--kernel/resource.c25
-rw-r--r--kernel/sched/completion.c18
-rw-r--r--kernel/sched/core.c163
-rw-r--r--kernel/sched/cpudeadline.c27
-rw-r--r--kernel/sched/cpudeadline.h2
-rw-r--r--kernel/sched/deadline.c79
-rw-r--r--kernel/sched/debug.c1
-rw-r--r--kernel/sched/fair.c15
-rw-r--r--kernel/sched/idle.c3
-rw-r--r--kernel/sched/rt.c26
-rw-r--r--kernel/sched/sched.h22
-rw-r--r--kernel/smpboot.c2
-rw-r--r--kernel/softirq.c9
-rw-r--r--kernel/sys.c4
-rw-r--r--kernel/sysctl.c3
-rw-r--r--kernel/taskstats.c13
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/clocksource.c76
-rw-r--r--kernel/time/hrtimer.c114
-rw-r--r--kernel/time/ntp.c11
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/time.c4
-rw-r--r--kernel/time/timecounter.c112
-rw-r--r--kernel/time/timekeeping.c12
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/ftrace.c53
-rw-r--r--kernel/trace/power-traces.c1
-rw-r--r--kernel/trace/trace.c1
-rw-r--r--kernel/trace/trace_event_perf.c4
-rw-r--r--kernel/trace/trace_events.c69
-rw-r--r--kernel/trace/trace_kdb.c4
-rw-r--r--kernel/trace/trace_kprobe.c4
-rw-r--r--kernel/trace/trace_syscalls.c4
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/workqueue.c25
89 files changed, 3166 insertions, 1424 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 76768ee812b2..08561f1acd13 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER
def_bool y
depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+config LOCK_SPIN_ON_OWNER
+ def_bool y
+ depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
+
config ARCH_USE_QUEUE_RWLOCK
bool
diff --git a/kernel/Makefile b/kernel/Makefile
index 13af308f2460..07737e50fe6e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -26,6 +26,7 @@ obj-y += power/
obj-y += printk/
obj-y += irq/
obj-y += rcu/
+obj-y += livepatch/
obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
diff --git a/kernel/audit.c b/kernel/audit.c
index f8f203e8018c..72ab759a0b43 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -429,7 +429,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
* This function doesn't consume an skb as might be expected since it has to
* copy it anyways.
*/
-static void kauditd_send_multicast_skb(struct sk_buff *skb)
+static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
{
struct sk_buff *copy;
struct audit_net *aunet = net_generic(&init_net, audit_net_id);
@@ -448,11 +448,11 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb)
* no reason for new multicast clients to continue with this
* non-compliance.
*/
- copy = skb_copy(skb, GFP_KERNEL);
+ copy = skb_copy(skb, gfp_mask);
if (!copy)
return;
- nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
+ nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
}
/*
@@ -1100,7 +1100,7 @@ static void audit_receive(struct sk_buff *skb)
}
/* Run custom bind function on netlink socket group connect or bind requests. */
-static int audit_bind(int group)
+static int audit_bind(struct net *net, int group)
{
if (!capable(CAP_AUDIT_READ))
return -EPERM;
@@ -1940,7 +1940,7 @@ void audit_log_end(struct audit_buffer *ab)
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
nlh->nlmsg_len = ab->skb->len;
- kauditd_send_multicast_skb(ab->skb);
+ kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
/*
* The original kaudit unicast socket sends up messages with
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 3598e13f2a65..4f68a326d92e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -442,19 +442,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
f->type = AUDIT_LOGINUID_SET;
f->val = 0;
- }
-
- if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
- struct pid *pid;
- rcu_read_lock();
- pid = find_vpid(f->val);
- if (!pid) {
- rcu_read_unlock();
- err = -ESRCH;
- goto exit_free;
- }
- f->val = pid_nr(pid);
- rcu_read_unlock();
+ entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
}
err = audit_field_valid(entry, f);
@@ -630,6 +618,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->buflen += data->values[i] =
audit_pack_string(&bufp, krule->filterkey);
break;
+ case AUDIT_LOGINUID_SET:
+ if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
+ data->fields[i] = AUDIT_LOGINUID;
+ data->values[i] = AUDIT_UID_UNSET;
+ break;
+ }
+ /* fallthrough if set */
default:
data->values[i] = f->val;
}
@@ -646,6 +641,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
int i;
if (a->flags != b->flags ||
+ a->pflags != b->pflags ||
a->listnr != b->listnr ||
a->action != b->action ||
a->field_count != b->field_count)
@@ -764,6 +760,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
new = &entry->rule;
new->vers_ops = old->vers_ops;
new->flags = old->flags;
+ new->pflags = old->pflags;
new->listnr = old->listnr;
new->action = old->action;
for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c75522a83678..072566dd0caf 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -72,6 +72,8 @@
#include <linux/fs_struct.h>
#include <linux/compat.h>
#include <linux/ctype.h>
+#include <linux/string.h>
+#include <uapi/linux/limits.h>
#include "audit.h"
@@ -1861,8 +1863,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
}
list_for_each_entry_reverse(n, &context->names_list, list) {
- /* does the name pointer match? */
- if (!n->name || n->name->name != name->name)
+ if (!n->name || strcmp(n->name->name, name->name))
continue;
/* match the correct record type */
@@ -1877,12 +1878,48 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
}
out_alloc:
- /* unable to find the name from a previous getname(). Allocate a new
- * anonymous entry.
- */
- n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
+ /* unable to find an entry with both a matching name and type */
+ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
return;
+ /* unfortunately, while we may have a path name to record with the
+ * inode, we can't always rely on the string lasting until the end of
+ * the syscall so we need to create our own copy, it may fail due to
+ * memory allocation issues, but we do our best */
+ if (name) {
+ /* we can't use getname_kernel() due to size limits */
+ size_t len = strlen(name->name) + 1;
+ struct filename *new = __getname();
+
+ if (unlikely(!new))
+ goto out;
+
+ if (len <= (PATH_MAX - sizeof(*new))) {
+ new->name = (char *)(new) + sizeof(*new);
+ new->separate = false;
+ } else if (len <= PATH_MAX) {
+ /* this looks odd, but is due to final_putname() */
+ struct filename *new2;
+
+ new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
+ if (unlikely(!new2)) {
+ __putname(new);
+ goto out;
+ }
+ new2->name = (char *)new;
+ new2->separate = true;
+ new = new2;
+ } else {
+ /* we should never get here, but let's be safe */
+ __putname(new);
+ goto out;
+ }
+ strlcpy((char *)new->name, name->name, len);
+ new->uptr = NULL;
+ new->aname = n;
+ n->name = new;
+ n->name_put = true;
+ }
out:
if (parent) {
n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d6594e457a25..a64e7a207d2b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
- module_free(NULL, hdr);
+ module_memfree(hdr);
}
#endif /* CONFIG_BPF_JIT */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 088ac0b1b106..536edc2be307 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -150,7 +150,7 @@ static int map_lookup_elem(union bpf_attr *attr)
int ufd = attr->map_fd;
struct fd f = fdget(ufd);
struct bpf_map *map;
- void *key, *value;
+ void *key, *value, *ptr;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
@@ -169,20 +169,29 @@ static int map_lookup_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
- err = -ENOENT;
- rcu_read_lock();
- value = map->ops->map_lookup_elem(map, key);
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER);
if (!value)
- goto err_unlock;
+ goto free_key;
+
+ rcu_read_lock();
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (ptr)
+ memcpy(value, ptr, map->value_size);
+ rcu_read_unlock();
+
+ err = -ENOENT;
+ if (!ptr)
+ goto free_value;
err = -EFAULT;
if (copy_to_user(uvalue, value, map->value_size) != 0)
- goto err_unlock;
+ goto free_value;
err = 0;
-err_unlock:
- rcu_read_unlock();
+free_value:
+ kfree(value);
free_key:
kfree(key);
err_put:
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bb263d0caab3..04cfe8ace520 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1909,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb)
*
* And don't kill the default root.
*/
- if (css_has_online_children(&root->cgrp.self) ||
+ if (!list_empty(&root->cgrp.self.children) ||
root == &cgrp_dfl_root)
cgroup_put(&root->cgrp);
else
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5d220234b3ca..1972b161c61e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,22 +58,23 @@ static int cpu_hotplug_disabled;
static struct {
struct task_struct *active_writer;
- struct mutex lock; /* Synchronizes accesses to refcount, */
+ /* wait queue to wake up the active_writer */
+ wait_queue_head_t wq;
+ /* verifies that no writer will get active while readers are active */
+ struct mutex lock;
/*
* Also blocks the new readers during
* an ongoing cpu hotplug operation.
*/
- int refcount;
- /* And allows lockless put_online_cpus(). */
- atomic_t puts_pending;
+ atomic_t refcount;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
} cpu_hotplug = {
.active_writer = NULL,
+ .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
- .refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
.dep_map = {.name = "cpu_hotplug.lock" },
#endif
@@ -86,15 +87,6 @@ static struct {
#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
-static void apply_puts_pending(int max)
-{
- int delta;
-
- if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
- delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
- cpu_hotplug.refcount -= delta;
- }
-}
void get_online_cpus(void)
{
@@ -103,8 +95,7 @@ void get_online_cpus(void)
return;
cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
- apply_puts_pending(65536);
- cpu_hotplug.refcount++;
+ atomic_inc(&cpu_hotplug.refcount);
mutex_unlock(&cpu_hotplug.lock);
}
EXPORT_SYMBOL_GPL(get_online_cpus);
@@ -116,8 +107,7 @@ bool try_get_online_cpus(void)
if (!mutex_trylock(&cpu_hotplug.lock))
return false;
cpuhp_lock_acquire_tryread();
- apply_puts_pending(65536);
- cpu_hotplug.refcount++;
+ atomic_inc(&cpu_hotplug.refcount);
mutex_unlock(&cpu_hotplug.lock);
return true;
}
@@ -125,20 +115,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus);
void put_online_cpus(void)
{
+ int refcount;
+
if (cpu_hotplug.active_writer == current)
return;
- if (!mutex_trylock(&cpu_hotplug.lock)) {
- atomic_inc(&cpu_hotplug.puts_pending);
- cpuhp_lock_release();
- return;
- }
- if (WARN_ON(!cpu_hotplug.refcount))
- cpu_hotplug.refcount++; /* try to fix things up */
+ refcount = atomic_dec_return(&cpu_hotplug.refcount);
+ if (WARN_ON(refcount < 0)) /* try to fix things up */
+ atomic_inc(&cpu_hotplug.refcount);
+
+ if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
+ wake_up(&cpu_hotplug.wq);
- if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
- wake_up_process(cpu_hotplug.active_writer);
- mutex_unlock(&cpu_hotplug.lock);
cpuhp_lock_release();
}
@@ -168,18 +156,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
*/
void cpu_hotplug_begin(void)
{
- cpu_hotplug.active_writer = current;
+ DEFINE_WAIT(wait);
+ cpu_hotplug.active_writer = current;
cpuhp_lock_acquire();
+
for (;;) {
mutex_lock(&cpu_hotplug.lock);
- apply_puts_pending(1);
- if (likely(!cpu_hotplug.refcount))
- break;
- __set_current_state(TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (likely(!atomic_read(&cpu_hotplug.refcount)))
+ break;
mutex_unlock(&cpu_hotplug.lock);
schedule();
}
+ finish_wait(&cpu_hotplug.wq, &wait);
}
void cpu_hotplug_done(void)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1adf62b39b96..07ce18ca71e0 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -27,6 +27,9 @@
* version 2. This program is licensed "as is" without any warranty of any
* kind, whether express or implied.
*/
+
+#define pr_fmt(fmt) "KGDB: " fmt
+
#include <linux/pid_namespace.h>
#include <linux/clocksource.h>
#include <linux/serial_core.h>
@@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr)
return err;
err = kgdb_arch_remove_breakpoint(&tmp);
if (err)
- printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
- "memory destroyed at: %lx", addr);
+ pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n",
+ addr);
return err;
}
@@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void)
error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
if (error) {
ret = error;
- printk(KERN_INFO "KGDB: BP install failed: %lx",
- kgdb_break[i].bpt_addr);
+ pr_info("BP install failed: %lx\n",
+ kgdb_break[i].bpt_addr);
continue;
}
@@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void)
continue;
error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
if (error) {
- printk(KERN_INFO "KGDB: BP remove failed: %lx\n",
- kgdb_break[i].bpt_addr);
+ pr_info("BP remove failed: %lx\n",
+ kgdb_break[i].bpt_addr);
ret = error;
}
@@ -367,7 +370,7 @@ int dbg_remove_all_break(void)
goto setundefined;
error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
if (error)
- printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
+ pr_err("breakpoint remove failed: %lx\n",
kgdb_break[i].bpt_addr);
setundefined:
kgdb_break[i].state = BP_UNDEFINED;
@@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait)
if (print_wait) {
#ifdef CONFIG_KGDB_KDB
if (!dbg_kdb_mode)
- printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
+ pr_crit("waiting... or $3#33 for KDB\n");
#else
- printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
+ pr_crit("Waiting for remote debugger\n");
#endif
}
return 1;
@@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
exception_level = 0;
kgdb_skipexception(ks->ex_vector, ks->linux_regs);
dbg_activate_sw_breakpoints();
- printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
- addr);
+ pr_crit("re-enter error: breakpoint removed %lx\n", addr);
WARN_ON_ONCE(1);
return 1;
@@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
panic("Recursive entry to debugger");
}
- printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
+ pr_crit("re-enter exception: ALL breakpoints killed\n");
#ifdef CONFIG_KGDB_KDB
/* Allow kdb to debug itself one level */
return 0;
@@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
int cpu;
int trace_on = 0;
int online_cpus = num_online_cpus();
+ u64 time_left;
kgdb_info[ks->cpu].enter_kgdb++;
kgdb_info[ks->cpu].exception_state |= exception_state;
@@ -595,9 +598,13 @@ return_normal:
/*
* Wait for the other CPUs to be notified and be waiting for us:
*/
- while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
- atomic_read(&slaves_in_kgdb)) != online_cpus)
+ time_left = loops_per_jiffy * HZ;
+ while (kgdb_do_roundup && --time_left &&
+ (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
+ online_cpus)
cpu_relax();
+ if (!time_left)
+ pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
/*
* At this point the primary processor is completely
@@ -795,15 +802,15 @@ static struct console kgdbcons = {
static void sysrq_handle_dbg(int key)
{
if (!dbg_io_ops) {
- printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
+ pr_crit("ERROR: No KGDB I/O module available\n");
return;
}
if (!kgdb_connected) {
#ifdef CONFIG_KGDB_KDB
if (!dbg_kdb_mode)
- printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
+ pr_crit("KGDB or $3#33 for KDB\n");
#else
- printk(KERN_CRIT "Entering KGDB\n");
+ pr_crit("Entering KGDB\n");
#endif
}
@@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void)
{
kgdb_break_asap = 0;
- printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
+ pr_crit("Waiting for connection from remote gdb...\n");
kgdb_breakpoint();
}
@@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
if (dbg_io_ops) {
spin_unlock(&kgdb_registration_lock);
- printk(KERN_ERR "kgdb: Another I/O driver is already "
- "registered with KGDB.\n");
+ pr_err("Another I/O driver is already registered with KGDB\n");
return -EBUSY;
}
@@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
spin_unlock(&kgdb_registration_lock);
- printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
- new_dbg_io_ops->name);
+ pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name);
/* Arm KGDB now. */
kgdb_register_callbacks();
@@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
spin_unlock(&kgdb_registration_lock);
- printk(KERN_INFO
- "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
+ pr_info("Unregistered I/O driver %s, debugger disabled\n",
old_dbg_io_ops->name);
}
EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index b20d544f20c2..e1dbf4a2c69e 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -531,22 +531,29 @@ void __init kdb_initbptab(void)
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
bp->bp_free = 1;
- kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
- "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
- "Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
+ "Set/Display breakpoints", 0,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
+ "Display breakpoints", 0,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
- kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
- "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("bc", kdb_bc, "<bpnum>",
- "Clear Breakpoint", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("be", kdb_bc, "<bpnum>",
- "Enable Breakpoint", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bd", kdb_bc, "<bpnum>",
- "Disable Breakpoint", 0, KDB_REPEAT_NONE);
-
- kdb_register_repeat("ss", kdb_ss, "",
- "Single Step", 1, KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
+ "[datar [length]|dataw [length]] Set hw brk", 0,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bc", kdb_bc, "<bpnum>",
+ "Clear Breakpoint", 0,
+ KDB_ENABLE_FLOW_CTRL);
+ kdb_register_flags("be", kdb_bc, "<bpnum>",
+ "Enable Breakpoint", 0,
+ KDB_ENABLE_FLOW_CTRL);
+ kdb_register_flags("bd", kdb_bc, "<bpnum>",
+ "Disable Breakpoint", 0,
+ KDB_ENABLE_FLOW_CTRL);
+
+ kdb_register_flags("ss", kdb_ss, "",
+ "Single Step", 1,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
/*
* Architecture dependent initialization.
*/
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8859ca34dcfe..15e1a7af5dd0 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks)
ks->pass_exception = 1;
KDB_FLAG_SET(CATASTROPHIC);
}
+ /* set CATASTROPHIC if the system contains unresponsive processors */
+ for_each_online_cpu(i)
+ if (!kgdb_info[i].enter_kgdb)
+ KDB_FLAG_SET(CATASTROPHIC);
if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
KDB_STATE_CLEAR(SSBPT);
KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 379650b984f8..7b40c5f07dce 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -12,6 +12,7 @@
*/
#include <linux/ctype.h>
+#include <linux/types.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/kmsg_dump.h>
@@ -23,6 +24,7 @@
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
@@ -42,6 +44,12 @@
#include <linux/slab.h>
#include "kdb_private.h"
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "kdb."
+
+static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
+module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
+
#define GREP_LEN 256
char kdb_grep_string[GREP_LEN];
int kdb_grepping_flag;
@@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = {
KDBMSG(BADLENGTH, "Invalid length field"),
KDBMSG(NOBP, "No Breakpoint exists"),
KDBMSG(BADADDR, "Invalid address"),
+ KDBMSG(NOPERM, "Permission denied"),
};
#undef KDBMSG
@@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu)
}
/*
+ * Check whether the flags of the current command and the permissions
+ * of the kdb console has allow a command to be run.
+ */
+static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
+ bool no_args)
+{
+ /* permissions comes from userspace so needs massaging slightly */
+ permissions &= KDB_ENABLE_MASK;
+ permissions |= KDB_ENABLE_ALWAYS_SAFE;
+
+ /* some commands change group when launched with no arguments */
+ if (no_args)
+ permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT;
+
+ flags |= KDB_ENABLE_ALL;
+
+ return permissions & flags;
+}
+
+/*
* kdbgetenv - This function will return the character string value of
* an environment variable.
* Parameters:
@@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
kdb_symtab_t symtab;
/*
+ * If the enable flags prohibit both arbitrary memory access
+ * and flow control then there are no reasonable grounds to
+ * provide symbol lookup.
+ */
+ if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL,
+ kdb_cmd_enabled, false))
+ return KDB_NOPERM;
+
+ /*
* Process arguments which follow the following syntax:
*
* symbol | numeric-address [+/- numeric-offset]
@@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
if (!s->count)
s->usable = 0;
if (s->usable)
- kdb_register(s->name, kdb_exec_defcmd,
- s->usage, s->help, 0);
+ /* macros are always safe because when executed each
+ * internal command re-enters kdb_parse() and is
+ * safety checked individually.
+ */
+ kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
+ s->help, 0,
+ KDB_ENABLE_ALWAYS_SAFE);
return 0;
}
if (!s->usable)
@@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr)
if (i < kdb_max_commands) {
int result;
+
+ if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
+ return KDB_NOPERM;
+
KDB_STATE_SET(CMD);
result = (*tp->cmd_func)(argc-1, (const char **)argv);
if (result && ignore_errors && result > KDB_CMD_GO)
result = 0;
KDB_STATE_CLEAR(CMD);
- switch (tp->cmd_repeat) {
- case KDB_REPEAT_NONE:
- argc = 0;
- if (argv[0])
- *(argv[0]) = '\0';
- break;
- case KDB_REPEAT_NO_ARGS:
- argc = 1;
- if (argv[1])
- *(argv[1]) = '\0';
- break;
- case KDB_REPEAT_WITH_ARGS:
- break;
- }
+
+ if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
+ return result;
+
+ argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
+ if (argv[argc])
+ *(argv[argc]) = '\0';
return result;
}
@@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv)
*/
static int kdb_sr(int argc, const char **argv)
{
+ bool check_mask =
+ !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false);
+
if (argc != 1)
return KDB_ARGCOUNT;
+
kdb_trap_printk++;
- __handle_sysrq(*argv[1], false);
+ __handle_sysrq(*argv[1], check_mask);
kdb_trap_printk--;
return 0;
@@ -1979,7 +2023,7 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf("%-20s%8u 0x%p ", mod->name,
mod->core_size, (void *)mod);
#ifdef CONFIG_MODULE_UNLOAD
- kdb_printf("%4ld ", module_refcount(mod));
+ kdb_printf("%4d ", module_refcount(mod));
#endif
if (mod->state == MODULE_STATE_GOING)
kdb_printf(" (Unloading)");
@@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void)
for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
if (!cpu_online(i)) {
state = 'F'; /* cpu is offline */
+ } else if (!kgdb_info[i].enter_kgdb) {
+ state = 'D'; /* cpu is online but unresponsive */
} else {
state = ' '; /* cpu is responding to kdb */
if (kdb_task_state_char(KDB_TSK(i)) == 'I')
@@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv)
/*
* Validate cpunum
*/
- if ((cpunum > NR_CPUS) || !cpu_online(cpunum))
+ if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
return KDB_BADCPUNUM;
dbg_switch_cpu = cpunum;
@@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv)
return 0;
if (!kt->cmd_name)
continue;
+ if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
+ continue;
if (strlen(kt->cmd_usage) > 20)
space = "\n ";
kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
@@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv)
}
/*
- * kdb_register_repeat - This function is used to register a kernel
+ * kdb_register_flags - This function is used to register a kernel
* debugger command.
* Inputs:
* cmd Command name
@@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv)
* zero for success, one if a duplicate command.
*/
#define kdb_command_extend 50 /* arbitrary */
-int kdb_register_repeat(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen,
- kdb_repeat_t repeat)
+int kdb_register_flags(char *cmd,
+ kdb_func_t func,
+ char *usage,
+ char *help,
+ short minlen,
+ kdb_cmdflags_t flags)
{
int i;
kdbtab_t *kp;
@@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd,
kp->cmd_func = func;
kp->cmd_usage = usage;
kp->cmd_help = help;
- kp->cmd_flags = 0;
kp->cmd_minlen = minlen;
- kp->cmd_repeat = repeat;
+ kp->cmd_flags = flags;
return 0;
}
-EXPORT_SYMBOL_GPL(kdb_register_repeat);
+EXPORT_SYMBOL_GPL(kdb_register_flags);
/*
* kdb_register - Compatibility register function for commands that do
* not need to specify a repeat state. Equivalent to
- * kdb_register_repeat with KDB_REPEAT_NONE.
+ * kdb_register_flags with flags set to 0.
* Inputs:
* cmd Command name
* func Function to execute the command
@@ -2721,8 +2768,7 @@ int kdb_register(char *cmd,
char *help,
short minlen)
{
- return kdb_register_repeat(cmd, func, usage, help, minlen,
- KDB_REPEAT_NONE);
+ return kdb_register_flags(cmd, func, usage, help, minlen, 0);
}
EXPORT_SYMBOL_GPL(kdb_register);
@@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void)
for_each_kdbcmd(kp, i)
kp->cmd_name = NULL;
- kdb_register_repeat("md", kdb_md, "<vaddr>",
+ kdb_register_flags("md", kdb_md, "<vaddr>",
"Display Memory Contents, also mdWcN, e.g. md8c1", 1,
- KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>",
- "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>",
- "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("mds", kdb_md, "<vaddr>",
- "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>",
- "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("go", kdb_go, "[<vaddr>]",
- "Continue Execution", 1, KDB_REPEAT_NONE);
- kdb_register_repeat("rd", kdb_rd, "",
- "Display Registers", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("rm", kdb_rm, "<reg> <contents>",
- "Modify Registers", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("ef", kdb_ef, "<vaddr>",
- "Display exception frame", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bt", kdb_bt, "[<vaddr>]",
- "Stack traceback", 1, KDB_REPEAT_NONE);
- kdb_register_repeat("btp", kdb_bt, "<pid>",
- "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
- "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("btc", kdb_bt, "",
- "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("btt", kdb_bt, "<vaddr>",
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
+ "Display Raw Memory", 0,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
+ "Display Physical Memory", 0,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mds", kdb_md, "<vaddr>",
+ "Display Memory Symbolically", 0,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
+ "Modify Memory Contents", 0,
+ KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("go", kdb_go, "[<vaddr>]",
+ "Continue Execution", 1,
+ KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
+ kdb_register_flags("rd", kdb_rd, "",
+ "Display Registers", 0,
+ KDB_ENABLE_REG_READ);
+ kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
+ "Modify Registers", 0,
+ KDB_ENABLE_REG_WRITE);
+ kdb_register_flags("ef", kdb_ef, "<vaddr>",
+ "Display exception frame", 0,
+ KDB_ENABLE_MEM_READ);
+ kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
+ "Stack traceback", 1,
+ KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
+ kdb_register_flags("btp", kdb_bt, "<pid>",
+ "Display stack for process <pid>", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
+ "Backtrace all processes matching state flag", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("btc", kdb_bt, "",
+ "Backtrace current process on each cpu", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("btt", kdb_bt, "<vaddr>",
"Backtrace process given its struct task address", 0,
- KDB_REPEAT_NONE);
- kdb_register_repeat("env", kdb_env, "",
- "Show environment variables", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("set", kdb_set, "",
- "Set environment variables", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("help", kdb_help, "",
- "Display Help Message", 1, KDB_REPEAT_NONE);
- kdb_register_repeat("?", kdb_help, "",
- "Display Help Message", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("cpu", kdb_cpu, "<cpunum>",
- "Switch to new cpu", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("kgdb", kdb_kgdb, "",
- "Enter kgdb mode", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("ps", kdb_ps, "[<flags>|A]",
- "Display active task list", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("pid", kdb_pid, "<pidnum>",
- "Switch to another task", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("reboot", kdb_reboot, "",
- "Reboot the machine immediately", 0, KDB_REPEAT_NONE);
+ KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
+ kdb_register_flags("env", kdb_env, "",
+ "Show environment variables", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("set", kdb_set, "",
+ "Set environment variables", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("help", kdb_help, "",
+ "Display Help Message", 1,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("?", kdb_help, "",
+ "Display Help Message", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
+ "Switch to new cpu", 0,
+ KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
+ kdb_register_flags("kgdb", kdb_kgdb, "",
+ "Enter kgdb mode", 0, 0);
+ kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
+ "Display active task list", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("pid", kdb_pid, "<pidnum>",
+ "Switch to another task", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("reboot", kdb_reboot, "",
+ "Reboot the machine immediately", 0,
+ KDB_ENABLE_REBOOT);
#if defined(CONFIG_MODULES)
- kdb_register_repeat("lsmod", kdb_lsmod, "",
- "List loaded kernel modules", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("lsmod", kdb_lsmod, "",
+ "List loaded kernel modules", 0,
+ KDB_ENABLE_INSPECT);
#endif
#if defined(CONFIG_MAGIC_SYSRQ)
- kdb_register_repeat("sr", kdb_sr, "<key>",
- "Magic SysRq key", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("sr", kdb_sr, "<key>",
+ "Magic SysRq key", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
#endif
#if defined(CONFIG_PRINTK)
- kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
- "Display syslog buffer", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
+ "Display syslog buffer", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
#endif
if (arch_kgdb_ops.enable_nmi) {
- kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
- "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
- }
- kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
- "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
- "Send a signal to a process", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("summary", kdb_summary, "",
- "Summarize the system", 4, KDB_REPEAT_NONE);
- kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
- "Display per_cpu variables", 3, KDB_REPEAT_NONE);
- kdb_register_repeat("grephelp", kdb_grep_help, "",
- "Display help on | grep", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
+ "Disable NMI entry to KDB", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ }
+ kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
+ "Define a set of commands, down to endefcmd", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
+ "Send a signal to a process", 0,
+ KDB_ENABLE_SIGNAL);
+ kdb_register_flags("summary", kdb_summary, "",
+ "Summarize the system", 4,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
+ "Display per_cpu variables", 3,
+ KDB_ENABLE_MEM_READ);
+ kdb_register_flags("grephelp", kdb_grep_help, "",
+ "Display help on | grep", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
}
/* Execute any commands defined in kdb_cmds. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 7afd3c8c41d5..eaacd1693954 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -172,10 +172,9 @@ typedef struct _kdbtab {
kdb_func_t cmd_func; /* Function to execute command */
char *cmd_usage; /* Usage String for this command */
char *cmd_help; /* Help message for this command */
- short cmd_flags; /* Parsing flags */
short cmd_minlen; /* Minimum legal # command
* chars required */
- kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
+ kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
} kdbtab_t;
extern int kdb_bt(int, const char **); /* KDB display back trace */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 113b837470cd..7f2fbb8b5069 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu)
pmu->pmu_enable(pmu);
}
-static DEFINE_PER_CPU(struct list_head, rotation_list);
+static DEFINE_PER_CPU(struct list_head, active_ctx_list);
/*
- * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
- * because they're strictly cpu affine and rotate_start is called with IRQs
- * disabled, while rotate_context is called from IRQ context.
+ * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
+ * perf_event_task_tick() are fully serialized because they're strictly cpu
+ * affine and perf_event_ctx{activate,deactivate} are called with IRQs
+ * disabled, while perf_event_task_tick is called from IRQ context.
*/
-static void perf_pmu_rotate_start(struct pmu *pmu)
+static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- struct list_head *head = this_cpu_ptr(&rotation_list);
+ struct list_head *head = this_cpu_ptr(&active_ctx_list);
WARN_ON(!irqs_disabled());
- if (list_empty(&cpuctx->rotation_list))
- list_add(&cpuctx->rotation_list, head);
+ WARN_ON(!list_empty(&ctx->active_ctx_list));
+
+ list_add(&ctx->active_ctx_list, head);
+}
+
+static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+{
+ WARN_ON(!irqs_disabled());
+
+ WARN_ON(list_empty(&ctx->active_ctx_list));
+
+ list_del_init(&ctx->active_ctx_list);
}
static void get_ctx(struct perf_event_context *ctx)
@@ -907,6 +917,84 @@ static void put_ctx(struct perf_event_context *ctx)
}
/*
+ * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
+ * perf_pmu_migrate_context() we need some magic.
+ *
+ * Those places that change perf_event::ctx will hold both
+ * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
+ *
+ * Lock ordering is by mutex address. There is one other site where
+ * perf_event_context::mutex nests and that is put_event(). But remember that
+ * that is a parent<->child context relation, and migration does not affect
+ * children, therefore these two orderings should not interact.
+ *
+ * The change in perf_event::ctx does not affect children (as claimed above)
+ * because the sys_perf_event_open() case will install a new event and break
+ * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
+ * concerned with cpuctx and that doesn't have children.
+ *
+ * The places that change perf_event::ctx will issue:
+ *
+ * perf_remove_from_context();
+ * synchronize_rcu();
+ * perf_install_in_context();
+ *
+ * to affect the change. The remove_from_context() + synchronize_rcu() should
+ * quiesce the event, after which we can install it in the new location. This
+ * means that only external vectors (perf_fops, prctl) can perturb the event
+ * while in transit. Therefore all such accessors should also acquire
+ * perf_event_context::mutex to serialize against this.
+ *
+ * However; because event->ctx can change while we're waiting to acquire
+ * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
+ * function.
+ *
+ * Lock order:
+ * task_struct::perf_event_mutex
+ * perf_event_context::mutex
+ * perf_event_context::lock
+ * perf_event::child_mutex;
+ * perf_event::mmap_mutex
+ * mmap_sem
+ */
+static struct perf_event_context *
+perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
+{
+ struct perf_event_context *ctx;
+
+again:
+ rcu_read_lock();
+ ctx = ACCESS_ONCE(event->ctx);
+ if (!atomic_inc_not_zero(&ctx->refcount)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+
+ mutex_lock_nested(&ctx->mutex, nesting);
+ if (event->ctx != ctx) {
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+ goto again;
+ }
+
+ return ctx;
+}
+
+static inline struct perf_event_context *
+perf_event_ctx_lock(struct perf_event *event)
+{
+ return perf_event_ctx_lock_nested(event, 0);
+}
+
+static void perf_event_ctx_unlock(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+}
+
+/*
* This must be done under the ctx->lock, such as to serialize against
* context_equiv(), therefore we cannot call put_ctx() since that might end up
* calling scheduler related locks and ctx->lock nests inside those.
@@ -1155,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_branch_stack++;
list_add_rcu(&event->event_entry, &ctx->event_list);
- if (!ctx->nr_events)
- perf_pmu_rotate_start(ctx->pmu);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
@@ -1275,6 +1361,8 @@ static void perf_group_attach(struct perf_event *event)
if (group_leader == event)
return;
+ WARN_ON_ONCE(group_leader->ctx != event->ctx);
+
if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
!is_software_event(event))
group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
@@ -1296,6 +1384,10 @@ static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
+
+ WARN_ON_ONCE(event->ctx != ctx);
+ lockdep_assert_held(&ctx->lock);
+
/*
* We can have double detach due to exit/hot-unplug + close.
*/
@@ -1380,6 +1472,8 @@ static void perf_group_detach(struct perf_event *event)
/* Inherit group flags from the previous leader */
sibling->group_flags = event->group_flags;
+
+ WARN_ON_ONCE(sibling->ctx != event->ctx);
}
out:
@@ -1442,6 +1536,10 @@ event_sched_out(struct perf_event *event,
{
u64 tstamp = perf_event_time(event);
u64 delta;
+
+ WARN_ON_ONCE(event->ctx != ctx);
+ lockdep_assert_held(&ctx->lock);
+
/*
* An event which could not be activated because of
* filter mismatch still needs to have its timings
@@ -1471,7 +1569,8 @@ event_sched_out(struct perf_event *event,
if (!is_software_event(event))
cpuctx->active_oncpu--;
- ctx->nr_active--;
+ if (!--ctx->nr_active)
+ perf_event_ctx_deactivate(ctx);
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
@@ -1654,7 +1753,7 @@ int __perf_event_disable(void *info)
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
*/
-void perf_event_disable(struct perf_event *event)
+static void _perf_event_disable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
@@ -1695,6 +1794,19 @@ retry:
}
raw_spin_unlock_irq(&ctx->lock);
}
+
+/*
+ * Strictly speaking kernel users cannot create groups and therefore this
+ * interface does not need the perf_event_ctx_lock() magic.
+ */
+void perf_event_disable(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ _perf_event_disable(event);
+ perf_event_ctx_unlock(event, ctx);
+}
EXPORT_SYMBOL_GPL(perf_event_disable);
static void perf_set_shadow_time(struct perf_event *event,
@@ -1782,7 +1894,8 @@ event_sched_in(struct perf_event *event,
if (!is_software_event(event))
cpuctx->active_oncpu++;
- ctx->nr_active++;
+ if (!ctx->nr_active++)
+ perf_event_ctx_activate(ctx);
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq++;
@@ -2158,7 +2271,7 @@ unlock:
* perf_event_for_each_child or perf_event_for_each as described
* for perf_event_disable.
*/
-void perf_event_enable(struct perf_event *event)
+static void _perf_event_enable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
@@ -2214,9 +2327,21 @@ retry:
out:
raw_spin_unlock_irq(&ctx->lock);
}
+
+/*
+ * See perf_event_disable();
+ */
+void perf_event_enable(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ _perf_event_enable(event);
+ perf_event_ctx_unlock(event, ctx);
+}
EXPORT_SYMBOL_GPL(perf_event_enable);
-int perf_event_refresh(struct perf_event *event, int refresh)
+static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
* not supported on inherited events
@@ -2225,10 +2350,25 @@ int perf_event_refresh(struct perf_event *event, int refresh)
return -EINVAL;
atomic_add(refresh, &event->event_limit);
- perf_event_enable(event);
+ _perf_event_enable(event);
return 0;
}
+
+/*
+ * See perf_event_disable()
+ */
+int perf_event_refresh(struct perf_event *event, int refresh)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_event_refresh(event, refresh);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(perf_event_refresh);
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2612,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);
-
- /*
- * Since these rotations are per-cpu, we need to ensure the
- * cpu-context we got scheduled on is actually rotating.
- */
- perf_pmu_rotate_start(ctx->pmu);
}
/*
@@ -2905,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx)
list_rotate_left(&ctx->flexible_groups);
}
-/*
- * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
- * because they're strictly cpu affine and rotate_start is called with IRQs
- * disabled, while rotate_context is called from IRQ context.
- */
static int perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event_context *ctx = NULL;
- int rotate = 0, remove = 1;
+ int rotate = 0;
if (cpuctx->ctx.nr_events) {
- remove = 0;
if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
rotate = 1;
}
ctx = cpuctx->task_ctx;
if (ctx && ctx->nr_events) {
- remove = 0;
if (ctx->nr_events != ctx->nr_active)
rotate = 1;
}
@@ -2947,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
done:
- if (remove)
- list_del_init(&cpuctx->rotation_list);
return rotate;
}
@@ -2966,9 +3091,8 @@ bool perf_event_can_stop_tick(void)
void perf_event_task_tick(void)
{
- struct list_head *head = this_cpu_ptr(&rotation_list);
- struct perf_cpu_context *cpuctx, *tmp;
- struct perf_event_context *ctx;
+ struct list_head *head = this_cpu_ptr(&active_ctx_list);
+ struct perf_event_context *ctx, *tmp;
int throttled;
WARN_ON(!irqs_disabled());
@@ -2976,14 +3100,8 @@ void perf_event_task_tick(void)
__this_cpu_inc(perf_throttled_seq);
throttled = __this_cpu_xchg(perf_throttled_count, 0);
- list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
- ctx = &cpuctx->ctx;
+ list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
perf_adjust_freq_unthr_context(ctx, throttled);
-
- ctx = cpuctx->task_ctx;
- if (ctx)
- perf_adjust_freq_unthr_context(ctx, throttled);
- }
}
static int event_enable_on_exec(struct perf_event *event,
@@ -3142,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
{
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
+ INIT_LIST_HEAD(&ctx->active_ctx_list);
INIT_LIST_HEAD(&ctx->pinned_groups);
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
@@ -3421,7 +3540,16 @@ static void perf_remove_from_owner(struct perf_event *event)
rcu_read_unlock();
if (owner) {
- mutex_lock(&owner->perf_event_mutex);
+ /*
+ * If we're here through perf_event_exit_task() we're already
+ * holding ctx->mutex which would be an inversion wrt. the
+ * normal lock order.
+ *
+ * However we can safely take this lock because its the child
+ * ctx->mutex.
+ */
+ mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
+
/*
* We have to re-check the event->owner field, if it is cleared
* we raced with perf_event_exit_task(), acquiring the mutex
@@ -3440,7 +3568,7 @@ static void perf_remove_from_owner(struct perf_event *event)
*/
static void put_event(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
+ struct perf_event_context *ctx;
if (!atomic_long_dec_and_test(&event->refcount))
return;
@@ -3448,7 +3576,6 @@ static void put_event(struct perf_event *event)
if (!is_kernel_event(event))
perf_remove_from_owner(event);
- WARN_ON_ONCE(ctx->parent_ctx);
/*
* There are two ways this annotation is useful:
*
@@ -3461,7 +3588,8 @@ static void put_event(struct perf_event *event)
* the last filedesc died, so there is no possibility
* to trigger the AB-BA case.
*/
- mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+ ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
+ WARN_ON_ONCE(ctx->parent_ctx);
perf_remove_from_context(event, true);
mutex_unlock(&ctx->mutex);
@@ -3547,12 +3675,13 @@ static int perf_event_read_group(struct perf_event *event,
u64 read_format, char __user *buf)
{
struct perf_event *leader = event->group_leader, *sub;
- int n = 0, size = 0, ret = -EFAULT;
struct perf_event_context *ctx = leader->ctx;
- u64 values[5];
+ int n = 0, size = 0, ret;
u64 count, enabled, running;
+ u64 values[5];
+
+ lockdep_assert_held(&ctx->mutex);
- mutex_lock(&ctx->mutex);
count = perf_event_read_value(leader, &enabled, &running);
values[n++] = 1 + leader->nr_siblings;
@@ -3567,7 +3696,7 @@ static int perf_event_read_group(struct perf_event *event,
size = n * sizeof(u64);
if (copy_to_user(buf, values, size))
- goto unlock;
+ return -EFAULT;
ret = size;
@@ -3581,14 +3710,11 @@ static int perf_event_read_group(struct perf_event *event,
size = n * sizeof(u64);
if (copy_to_user(buf + ret, values, size)) {
- ret = -EFAULT;
- goto unlock;
+ return -EFAULT;
}
ret += size;
}
-unlock:
- mutex_unlock(&ctx->mutex);
return ret;
}
@@ -3660,8 +3786,14 @@ static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct perf_event *event = file->private_data;
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = perf_read_hw(event, buf, count);
+ perf_event_ctx_unlock(event, ctx);
- return perf_read_hw(event, buf, count);
+ return ret;
}
static unsigned int perf_poll(struct file *file, poll_table *wait)
@@ -3687,7 +3819,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
return events;
}
-static void perf_event_reset(struct perf_event *event)
+static void _perf_event_reset(struct perf_event *event)
{
(void)perf_event_read(event);
local64_set(&event->count, 0);
@@ -3706,6 +3838,7 @@ static void perf_event_for_each_child(struct perf_event *event,
struct perf_event *child;
WARN_ON_ONCE(event->ctx->parent_ctx);
+
mutex_lock(&event->child_mutex);
func(event);
list_for_each_entry(child, &event->child_list, child_list)
@@ -3719,14 +3852,13 @@ static void perf_event_for_each(struct perf_event *event,
struct perf_event_context *ctx = event->ctx;
struct perf_event *sibling;
- WARN_ON_ONCE(ctx->parent_ctx);
- mutex_lock(&ctx->mutex);
+ lockdep_assert_held(&ctx->mutex);
+
event = event->group_leader;
perf_event_for_each_child(event, func);
list_for_each_entry(sibling, &event->sibling_list, group_entry)
perf_event_for_each_child(sibling, func);
- mutex_unlock(&ctx->mutex);
}
static int perf_event_period(struct perf_event *event, u64 __user *arg)
@@ -3796,25 +3928,24 @@ static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
-static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
- struct perf_event *event = file->private_data;
void (*func)(struct perf_event *);
u32 flags = arg;
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
- func = perf_event_enable;
+ func = _perf_event_enable;
break;
case PERF_EVENT_IOC_DISABLE:
- func = perf_event_disable;
+ func = _perf_event_disable;
break;
case PERF_EVENT_IOC_RESET:
- func = perf_event_reset;
+ func = _perf_event_reset;
break;
case PERF_EVENT_IOC_REFRESH:
- return perf_event_refresh(event, arg);
+ return _perf_event_refresh(event, arg);
case PERF_EVENT_IOC_PERIOD:
return perf_event_period(event, (u64 __user *)arg);
@@ -3861,6 +3992,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return 0;
}
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct perf_event *event = file->private_data;
+ struct perf_event_context *ctx;
+ long ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_ioctl(event, cmd, arg);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+
#ifdef CONFIG_COMPAT
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
@@ -3883,11 +4027,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
int perf_event_task_enable(void)
{
+ struct perf_event_context *ctx;
struct perf_event *event;
mutex_lock(&current->perf_event_mutex);
- list_for_each_entry(event, &current->perf_event_list, owner_entry)
- perf_event_for_each_child(event, perf_event_enable);
+ list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+ ctx = perf_event_ctx_lock(event);
+ perf_event_for_each_child(event, _perf_event_enable);
+ perf_event_ctx_unlock(event, ctx);
+ }
mutex_unlock(&current->perf_event_mutex);
return 0;
@@ -3895,11 +4043,15 @@ int perf_event_task_enable(void)
int perf_event_task_disable(void)
{
+ struct perf_event_context *ctx;
struct perf_event *event;
mutex_lock(&current->perf_event_mutex);
- list_for_each_entry(event, &current->perf_event_list, owner_entry)
- perf_event_for_each_child(event, perf_event_disable);
+ list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+ ctx = perf_event_ctx_lock(event);
+ perf_event_for_each_child(event, _perf_event_disable);
+ perf_event_ctx_unlock(event, ctx);
+ }
mutex_unlock(&current->perf_event_mutex);
return 0;
@@ -4461,18 +4613,14 @@ perf_output_sample_regs(struct perf_output_handle *handle,
}
static void perf_sample_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs)
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
{
- if (!user_mode(regs)) {
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
- }
-
- if (regs) {
- regs_user->abi = perf_reg_abi(current);
+ if (user_mode(regs)) {
+ regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
+ } else if (current->mm) {
+ perf_get_regs_user(regs_user, regs, regs_user_copy);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
regs_user->regs = NULL;
@@ -4951,7 +5099,8 @@ void perf_prepare_sample(struct perf_event_header *header,
}
if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
- perf_sample_regs_user(&data->regs_user, regs);
+ perf_sample_regs_user(&data->regs_user, regs,
+ &data->regs_user_copy);
if (sample_type & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
@@ -5892,6 +6041,8 @@ end:
rcu_read_unlock();
}
+DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
+
int perf_swevent_get_recursion_context(void)
{
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
@@ -5907,21 +6058,30 @@ inline void perf_swevent_put_recursion_context(int rctx)
put_recursion_context(swhash->recursion, rctx);
}
-void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
struct perf_sample_data data;
- int rctx;
- preempt_disable_notrace();
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
+ if (WARN_ON_ONCE(!regs))
return;
perf_sample_data_init(&data, addr, 0);
-
do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+}
+
+void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+ int rctx;
+
+ preempt_disable_notrace();
+ rctx = perf_swevent_get_recursion_context();
+ if (unlikely(rctx < 0))
+ goto fail;
+
+ ___perf_sw_event(event_id, nr, regs, addr);
perf_swevent_put_recursion_context(rctx);
+fail:
preempt_enable_notrace();
}
@@ -6779,12 +6939,10 @@ skip_type:
__perf_event_init_context(&cpuctx->ctx);
lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
- cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
__perf_cpu_hrtimer_init(cpuctx, cpu);
- INIT_LIST_HEAD(&cpuctx->rotation_list);
cpuctx->unique_pmu = pmu;
}
@@ -6857,6 +7015,20 @@ void perf_pmu_unregister(struct pmu *pmu)
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
+static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
+{
+ int ret;
+
+ if (!try_module_get(pmu->module))
+ return -ENODEV;
+ event->pmu = pmu;
+ ret = pmu->event_init(event);
+ if (ret)
+ module_put(pmu->module);
+
+ return ret;
+}
+
struct pmu *perf_init_event(struct perf_event *event)
{
struct pmu *pmu = NULL;
@@ -6869,24 +7041,14 @@ struct pmu *perf_init_event(struct perf_event *event)
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
if (pmu) {
- if (!try_module_get(pmu->module)) {
- pmu = ERR_PTR(-ENODEV);
- goto unlock;
- }
- event->pmu = pmu;
- ret = pmu->event_init(event);
+ ret = perf_try_init_event(pmu, event);
if (ret)
pmu = ERR_PTR(ret);
goto unlock;
}
list_for_each_entry_rcu(pmu, &pmus, entry) {
- if (!try_module_get(pmu->module)) {
- pmu = ERR_PTR(-ENODEV);
- goto unlock;
- }
- event->pmu = pmu;
- ret = pmu->event_init(event);
+ ret = perf_try_init_event(pmu, event);
if (!ret)
goto unlock;
@@ -7250,6 +7412,15 @@ out:
return ret;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -7265,7 +7436,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event *group_leader = NULL, *output_event = NULL;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx;
+ struct perf_event_context *ctx, *uninitialized_var(gctx);
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -7423,7 +7594,19 @@ SYSCALL_DEFINE5(perf_event_open,
* task or CPU context:
*/
if (move_group) {
- if (group_leader->ctx->type != ctx->type)
+ /*
+ * Make sure we're both on the same task, or both
+ * per-cpu events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken; since
+ * you can never concurrently schedule them anyhow.
+ */
+ if (group_leader->cpu != event->cpu)
goto err_context;
} else {
if (group_leader->ctx != ctx)
@@ -7451,43 +7634,68 @@ SYSCALL_DEFINE5(perf_event_open,
}
if (move_group) {
- struct perf_event_context *gctx = group_leader->ctx;
-
- mutex_lock(&gctx->mutex);
- perf_remove_from_context(group_leader, false);
+ gctx = group_leader->ctx;
/*
- * Removing from the context ends up with disabled
- * event. What we want here is event in the initial
- * startup state, ready to be add into new context.
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
*/
- perf_event__state_init(group_leader);
+ mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+ perf_remove_from_context(group_leader, false);
+
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
perf_remove_from_context(sibling, false);
- perf_event__state_init(sibling);
put_ctx(gctx);
}
- mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
+ } else {
+ mutex_lock(&ctx->mutex);
}
WARN_ON_ONCE(ctx->parent_ctx);
- mutex_lock(&ctx->mutex);
if (move_group) {
+ /*
+ * Wait for everybody to stop referencing the events through
+ * the old lists, before installing it on new lists.
+ */
synchronize_rcu();
- perf_install_in_context(ctx, group_leader, event->cpu);
- get_ctx(ctx);
+
+ /*
+ * Install the group siblings before the group leader.
+ *
+ * Because a group leader will try and install the entire group
+ * (through the sibling list, which is still in-tact), we can
+ * end up with siblings installed in the wrong context.
+ *
+ * By installing siblings first we NO-OP because they're not
+ * reachable through the group lists.
+ */
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_install_in_context(ctx, sibling, event->cpu);
+ perf_event__state_init(sibling);
+ perf_install_in_context(ctx, sibling, sibling->cpu);
get_ctx(ctx);
}
+
+ /*
+ * Removing from the context ends up with disabled
+ * event. What we want here is event in the initial
+ * startup state, ready to be add into new context.
+ */
+ perf_event__state_init(group_leader);
+ perf_install_in_context(ctx, group_leader, group_leader->cpu);
+ get_ctx(ctx);
}
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
+
+ if (move_group) {
+ mutex_unlock(&gctx->mutex);
+ put_ctx(gctx);
+ }
mutex_unlock(&ctx->mutex);
put_online_cpus();
@@ -7595,7 +7803,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
- mutex_lock(&src_ctx->mutex);
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
perf_remove_from_context(event, false);
@@ -7603,11 +7815,36 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
}
- mutex_unlock(&src_ctx->mutex);
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
synchronize_rcu();
- mutex_lock(&dst_ctx->mutex);
+ /*
+ * Re-instate events in 2 passes.
+ *
+ * Skip over group leaders and only install siblings on this first
+ * pass, siblings will not get enabled without a leader, however a
+ * leader will enable its siblings, even if those are still on the old
+ * context.
+ */
+ list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ if (event->group_leader == event)
+ continue;
+
+ list_del(&event->migrate_entry);
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ account_event_cpu(event, dst_cpu);
+ perf_install_in_context(dst_ctx, event, dst_cpu);
+ get_ctx(dst_ctx);
+ }
+
+ /*
+ * Once all the siblings are setup properly, install the group leaders
+ * to make it go.
+ */
list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
list_del(&event->migrate_entry);
if (event->state >= PERF_EVENT_STATE_OFF)
@@ -7617,6 +7854,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
get_ctx(dst_ctx);
}
mutex_unlock(&dst_ctx->mutex);
+ mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
@@ -7803,14 +8041,19 @@ static void perf_free_event(struct perf_event *event,
put_event(parent);
+ raw_spin_lock_irq(&ctx->lock);
perf_group_detach(event);
list_del_event(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
free_event(event);
}
/*
- * free an unexposed, unused context as created by inheritance by
+ * Free an unexposed, unused context as created by inheritance by
* perf_event_init_task below, used by fork() in case of fail.
+ *
+ * Not all locks are strictly required, but take them anyway to be nice and
+ * help out with the lockdep assertions.
*/
void perf_event_free_task(struct task_struct *task)
{
@@ -8129,7 +8372,7 @@ static void __init perf_event_init_all_cpus(void)
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
- INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
+ INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
}
}
@@ -8150,22 +8393,11 @@ static void perf_event_init_cpu(int cpu)
}
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
-static void perf_pmu_rotate_stop(struct pmu *pmu)
-{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- WARN_ON(!irqs_disabled());
-
- list_del_init(&cpuctx->rotation_list);
-}
-
static void __perf_event_exit_context(void *__info)
{
struct remove_event re = { .detach_group = true };
struct perf_event_context *ctx = __info;
- perf_pmu_rotate_stop(ctx->pmu);
-
rcu_read_lock();
list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
__perf_remove_from_context(&re);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 146a5792b1d2..eadb95ce7aac 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -13,12 +13,13 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/circ_buf.h>
+#include <linux/poll.h>
#include "internal.h"
static void perf_output_wakeup(struct perf_output_handle *handle)
{
- atomic_set(&handle->rb->poll, POLL_IN);
+ atomic_set(&handle->rb->poll, POLLIN);
handle->event->pending_wakeup = 1;
irq_work_queue(&handle->event->pending);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1ea4369890a3..6806c55475ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1287,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
+ /*
+ * We can race with wait_task_zombie() from another thread.
+ * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
+ * can't confuse the checks below.
+ */
+ int exit_state = ACCESS_ONCE(p->exit_state);
int ret;
- if (unlikely(p->exit_state == EXIT_DEAD))
+ if (unlikely(exit_state == EXIT_DEAD))
return 0;
ret = eligible_child(wo, p);
@@ -1310,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
- if (unlikely(p->exit_state == EXIT_TRACE)) {
+ if (unlikely(exit_state == EXIT_TRACE)) {
/*
* ptrace == 0 means we are the natural parent. In this case
* we should clear notask_error, debugger will notify us.
@@ -1337,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
}
/* slay zombie? */
- if (p->exit_state == EXIT_ZOMBIE) {
+ if (exit_state == EXIT_ZOMBIE) {
/* we don't reap group leaders with subthreads */
if (!delay_group_leader(p)) {
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2ddade9f1..b379d9abddc7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
- if (unlikely(tmp->vm_flags & VM_NONLINEAR))
- vma_nonlinear_insert(tmp,
- &mapping->i_mmap_nonlinear);
- else
- vma_interval_tree_insert_after(tmp, mpnt,
- &mapping->i_mmap);
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
diff --git a/kernel/futex.c b/kernel/futex.c
index 63678b573d61..4eeb63de7e54 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart)
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
@@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
case FUTEX_WAKE_OP:
return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
case FUTEX_LOCK_PI:
- return futex_lock_pi(uaddr, flags, val, timeout, 0);
+ return futex_lock_pi(uaddr, flags, timeout, 0);
case FUTEX_UNLOCK_PI:
return futex_unlock_pi(uaddr, flags);
case FUTEX_TRYLOCK_PI:
- return futex_lock_pi(uaddr, flags, 0, timeout, 1);
+ return futex_lock_pi(uaddr, flags, NULL, 1);
case FUTEX_WAIT_REQUEUE_PI:
val3 = FUTEX_BITSET_MATCH_ANY;
return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4332d766619d..df553b0af936 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -78,8 +78,12 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
#ifdef CONFIG_SPARSE_IRQ
static inline void irq_mark_irq(unsigned int irq) { }
+extern void irq_lock_sparse(void);
+extern void irq_unlock_sparse(void);
#else
extern void irq_mark_irq(unsigned int irq);
+static inline void irq_lock_sparse(void) { }
+static inline void irq_unlock_sparse(void) { }
#endif
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a1782f88f0af..99793b9b6d23 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -132,6 +132,16 @@ static void free_masks(struct irq_desc *desc)
static inline void free_masks(struct irq_desc *desc) { }
#endif
+void irq_lock_sparse(void)
+{
+ mutex_lock(&sparse_irq_lock);
+}
+
+void irq_unlock_sparse(void)
+{
+ mutex_unlock(&sparse_irq_lock);
+}
+
static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
{
struct irq_desc *desc;
@@ -168,6 +178,12 @@ static void free_desc(unsigned int irq)
unregister_irq_proc(irq, desc);
+ /*
+ * sparse_irq_lock protects also show_interrupts() and
+ * kstat_irq_usr(). Once we deleted the descriptor from the
+ * sparse tree we can free it. Access in proc will fail to
+ * lookup the descriptor.
+ */
mutex_lock(&sparse_irq_lock);
delete_irq_desc(irq);
mutex_unlock(&sparse_irq_lock);
@@ -574,6 +590,15 @@ void kstat_incr_irq_this_cpu(unsigned int irq)
kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
}
+/**
+ * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu
+ * @irq: The interrupt number
+ * @cpu: The cpu number
+ *
+ * Returns the sum of interrupt counts on @cpu since boot for
+ * @irq. The caller must ensure that the interrupt is not removed
+ * concurrently.
+ */
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -582,6 +607,14 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
}
+/**
+ * kstat_irqs - Get the statistics for an interrupt
+ * @irq: The interrupt number
+ *
+ * Returns the sum of interrupt counts on all cpus since boot for
+ * @irq. The caller must ensure that the interrupt is not removed
+ * concurrently.
+ */
unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -594,3 +627,22 @@ unsigned int kstat_irqs(unsigned int irq)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
}
+
+/**
+ * kstat_irqs_usr - Get the statistics for an interrupt
+ * @irq: The interrupt number
+ *
+ * Returns the sum of interrupt counts on all cpus since boot for
+ * @irq. Contrary to kstat_irqs() this can be called from any
+ * preemptible context. It's protected against concurrent removal of
+ * an interrupt descriptor when sparse irqs are enabled.
+ */
+unsigned int kstat_irqs_usr(unsigned int irq)
+{
+ int sum;
+
+ irq_lock_sparse();
+ sum = kstat_irqs(irq);
+ irq_unlock_sparse();
+ return sum;
+}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index ac1ba2f11032..9dc9bfd8a678 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -15,6 +15,23 @@
#include "internals.h"
+/*
+ * Access rules:
+ *
+ * procfs protects read/write of /proc/irq/N/ files against a
+ * concurrent free of the interrupt descriptor. remove_proc_entry()
+ * immediately prevents new read/writes to happen and waits for
+ * already running read/write functions to complete.
+ *
+ * We remove the proc entries first and then delete the interrupt
+ * descriptor from the radix tree and free it. So it is guaranteed
+ * that irq_to_desc(N) is valid as long as the read/writes are
+ * permitted by procfs.
+ *
+ * The read from /proc/interrupts is a different problem because there
+ * is no protection. So the lookup and the access to irqdesc
+ * information must be protected by sparse_irq_lock.
+ */
static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
@@ -437,9 +454,10 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
}
+ irq_lock_sparse();
desc = irq_to_desc(i);
if (!desc)
- return 0;
+ goto outsparse;
raw_spin_lock_irqsave(&desc->lock, flags);
for_each_online_cpu(j)
@@ -479,6 +497,8 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
out:
raw_spin_unlock_irqrestore(&desc->lock, flags);
+outsparse:
+ irq_unlock_sparse();
return 0;
}
#endif
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9a8a01abbaed..c85277639b34 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -2512,7 +2512,7 @@ static int kexec_apply_relocations(struct kimage *image)
continue;
/*
- * Respective archicture needs to provide support for applying
+ * Respective architecture needs to provide support for applying
* relocations of type SHT_RELA/SHT_REL.
*/
if (sechdrs[i].sh_type == SHT_RELA)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 06f58309fed2..ee619929cf90 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -127,7 +127,7 @@ static void *alloc_insn_page(void)
static void free_insn_page(void *page)
{
- module_free(NULL, page);
+ module_memfree(page);
}
struct kprobe_insn_cache kprobe_insn_slots = {
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
new file mode 100644
index 000000000000..045022557936
--- /dev/null
+++ b/kernel/livepatch/Kconfig
@@ -0,0 +1,18 @@
+config HAVE_LIVEPATCH
+ bool
+ help
+ Arch supports kernel live patching
+
+config LIVEPATCH
+ bool "Kernel Live Patching"
+ depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on MODULES
+ depends on SYSFS
+ depends on KALLSYMS_ALL
+ depends on HAVE_LIVEPATCH
+ help
+ Say Y here if you want to support kernel live patching.
+ This option has no runtime impact until a kernel "patch"
+ module uses the interface provided by this option to register
+ a patch, causing calls to patched functions to be redirected
+ to new function code contained in the patch module.
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
new file mode 100644
index 000000000000..e8780c0901d9
--- /dev/null
+++ b/kernel/livepatch/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
+
+livepatch-objs := core.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
new file mode 100644
index 000000000000..ff7f47d026ac
--- /dev/null
+++ b/kernel/livepatch/core.c
@@ -0,0 +1,1015 @@
+/*
+ * core.c - Kernel Live Patching Core
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/ftrace.h>
+#include <linux/list.h>
+#include <linux/kallsyms.h>
+#include <linux/livepatch.h>
+
+/**
+ * struct klp_ops - structure for tracking registered ftrace ops structs
+ *
+ * A single ftrace_ops is shared between all enabled replacement functions
+ * (klp_func structs) which have the same old_addr. This allows the switch
+ * between function versions to happen instantaneously by updating the klp_ops
+ * struct's func_stack list. The winner is the klp_func at the top of the
+ * func_stack (front of the list).
+ *
+ * @node: node for the global klp_ops list
+ * @func_stack: list head for the stack of klp_func's (active func is on top)
+ * @fops: registered ftrace ops struct
+ */
+struct klp_ops {
+ struct list_head node;
+ struct list_head func_stack;
+ struct ftrace_ops fops;
+};
+
+/*
+ * The klp_mutex protects the global lists and state transitions of any
+ * structure reachable from them. References to any structure must be obtained
+ * under mutex protection (except in klp_ftrace_handler(), which uses RCU to
+ * ensure it gets consistent data).
+ */
+static DEFINE_MUTEX(klp_mutex);
+
+static LIST_HEAD(klp_patches);
+static LIST_HEAD(klp_ops);
+
+static struct kobject *klp_root_kobj;
+
+static struct klp_ops *klp_find_ops(unsigned long old_addr)
+{
+ struct klp_ops *ops;
+ struct klp_func *func;
+
+ list_for_each_entry(ops, &klp_ops, node) {
+ func = list_first_entry(&ops->func_stack, struct klp_func,
+ stack_node);
+ if (func->old_addr == old_addr)
+ return ops;
+ }
+
+ return NULL;
+}
+
+static bool klp_is_module(struct klp_object *obj)
+{
+ return obj->name;
+}
+
+static bool klp_is_object_loaded(struct klp_object *obj)
+{
+ return !obj->name || obj->mod;
+}
+
+/* sets obj->mod if object is not vmlinux and module is found */
+static void klp_find_object_module(struct klp_object *obj)
+{
+ if (!klp_is_module(obj))
+ return;
+
+ mutex_lock(&module_mutex);
+ /*
+ * We don't need to take a reference on the module here because we have
+ * the klp_mutex, which is also taken by the module notifier. This
+ * prevents any module from unloading until we release the klp_mutex.
+ */
+ obj->mod = find_module(obj->name);
+ mutex_unlock(&module_mutex);
+}
+
+/* klp_mutex must be held by caller */
+static bool klp_is_patch_registered(struct klp_patch *patch)
+{
+ struct klp_patch *mypatch;
+
+ list_for_each_entry(mypatch, &klp_patches, list)
+ if (mypatch == patch)
+ return true;
+
+ return false;
+}
+
+static bool klp_initialized(void)
+{
+ return klp_root_kobj;
+}
+
+struct klp_find_arg {
+ const char *objname;
+ const char *name;
+ unsigned long addr;
+ /*
+ * If count == 0, the symbol was not found. If count == 1, a unique
+ * match was found and addr is set. If count > 1, there is
+ * unresolvable ambiguity among "count" number of symbols with the same
+ * name in the same object.
+ */
+ unsigned long count;
+};
+
+static int klp_find_callback(void *data, const char *name,
+ struct module *mod, unsigned long addr)
+{
+ struct klp_find_arg *args = data;
+
+ if ((mod && !args->objname) || (!mod && args->objname))
+ return 0;
+
+ if (strcmp(args->name, name))
+ return 0;
+
+ if (args->objname && strcmp(args->objname, mod->name))
+ return 0;
+
+ /*
+ * args->addr might be overwritten if another match is found
+ * but klp_find_object_symbol() handles this and only returns the
+ * addr if count == 1.
+ */
+ args->addr = addr;
+ args->count++;
+
+ return 0;
+}
+
+static int klp_find_object_symbol(const char *objname, const char *name,
+ unsigned long *addr)
+{
+ struct klp_find_arg args = {
+ .objname = objname,
+ .name = name,
+ .addr = 0,
+ .count = 0
+ };
+
+ kallsyms_on_each_symbol(klp_find_callback, &args);
+
+ if (args.count == 0)
+ pr_err("symbol '%s' not found in symbol table\n", name);
+ else if (args.count > 1)
+ pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
+ args.count, name, objname);
+ else {
+ *addr = args.addr;
+ return 0;
+ }
+
+ *addr = 0;
+ return -EINVAL;
+}
+
+struct klp_verify_args {
+ const char *name;
+ const unsigned long addr;
+};
+
+static int klp_verify_callback(void *data, const char *name,
+ struct module *mod, unsigned long addr)
+{
+ struct klp_verify_args *args = data;
+
+ if (!mod &&
+ !strcmp(args->name, name) &&
+ args->addr == addr)
+ return 1;
+
+ return 0;
+}
+
+static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
+{
+ struct klp_verify_args args = {
+ .name = name,
+ .addr = addr,
+ };
+
+ if (kallsyms_on_each_symbol(klp_verify_callback, &args))
+ return 0;
+
+ pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
+ name, addr);
+ return -EINVAL;
+}
+
+static int klp_find_verify_func_addr(struct klp_object *obj,
+ struct klp_func *func)
+{
+ int ret;
+
+#if defined(CONFIG_RANDOMIZE_BASE)
+ /* KASLR is enabled, disregard old_addr from user */
+ func->old_addr = 0;
+#endif
+
+ if (!func->old_addr || klp_is_module(obj))
+ ret = klp_find_object_symbol(obj->name, func->old_name,
+ &func->old_addr);
+ else
+ ret = klp_verify_vmlinux_symbol(func->old_name,
+ func->old_addr);
+
+ return ret;
+}
+
+/*
+ * external symbols are located outside the parent object (where the parent
+ * object is either vmlinux or the kmod being patched).
+ */
+static int klp_find_external_symbol(struct module *pmod, const char *name,
+ unsigned long *addr)
+{
+ const struct kernel_symbol *sym;
+
+ /* first, check if it's an exported symbol */
+ preempt_disable();
+ sym = find_symbol(name, NULL, NULL, true, true);
+ preempt_enable();
+ if (sym) {
+ *addr = sym->value;
+ return 0;
+ }
+
+ /* otherwise check if it's in another .o within the patch module */
+ return klp_find_object_symbol(pmod->name, name, addr);
+}
+
+static int klp_write_object_relocations(struct module *pmod,
+ struct klp_object *obj)
+{
+ int ret;
+ struct klp_reloc *reloc;
+
+ if (WARN_ON(!klp_is_object_loaded(obj)))
+ return -EINVAL;
+
+ if (WARN_ON(!obj->relocs))
+ return -EINVAL;
+
+ for (reloc = obj->relocs; reloc->name; reloc++) {
+ if (!klp_is_module(obj)) {
+ ret = klp_verify_vmlinux_symbol(reloc->name,
+ reloc->val);
+ if (ret)
+ return ret;
+ } else {
+ /* module, reloc->val needs to be discovered */
+ if (reloc->external)
+ ret = klp_find_external_symbol(pmod,
+ reloc->name,
+ &reloc->val);
+ else
+ ret = klp_find_object_symbol(obj->mod->name,
+ reloc->name,
+ &reloc->val);
+ if (ret)
+ return ret;
+ }
+ ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
+ reloc->val + reloc->addend);
+ if (ret) {
+ pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
+ reloc->name, reloc->val, ret);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void notrace klp_ftrace_handler(unsigned long ip,
+ unsigned long parent_ip,
+ struct ftrace_ops *fops,
+ struct pt_regs *regs)
+{
+ struct klp_ops *ops;
+ struct klp_func *func;
+
+ ops = container_of(fops, struct klp_ops, fops);
+
+ rcu_read_lock();
+ func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
+ stack_node);
+ rcu_read_unlock();
+
+ if (WARN_ON_ONCE(!func))
+ return;
+
+ klp_arch_set_pc(regs, (unsigned long)func->new_func);
+}
+
+static int klp_disable_func(struct klp_func *func)
+{
+ struct klp_ops *ops;
+ int ret;
+
+ if (WARN_ON(func->state != KLP_ENABLED))
+ return -EINVAL;
+
+ if (WARN_ON(!func->old_addr))
+ return -EINVAL;
+
+ ops = klp_find_ops(func->old_addr);
+ if (WARN_ON(!ops))
+ return -EINVAL;
+
+ if (list_is_singular(&ops->func_stack)) {
+ ret = unregister_ftrace_function(&ops->fops);
+ if (ret) {
+ pr_err("failed to unregister ftrace handler for function '%s' (%d)\n",
+ func->old_name, ret);
+ return ret;
+ }
+
+ ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
+ if (ret)
+ pr_warn("function unregister succeeded but failed to clear the filter\n");
+
+ list_del_rcu(&func->stack_node);
+ list_del(&ops->node);
+ kfree(ops);
+ } else {
+ list_del_rcu(&func->stack_node);
+ }
+
+ func->state = KLP_DISABLED;
+
+ return 0;
+}
+
+static int klp_enable_func(struct klp_func *func)
+{
+ struct klp_ops *ops;
+ int ret;
+
+ if (WARN_ON(!func->old_addr))
+ return -EINVAL;
+
+ if (WARN_ON(func->state != KLP_DISABLED))
+ return -EINVAL;
+
+ ops = klp_find_ops(func->old_addr);
+ if (!ops) {
+ ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ ops->fops.func = klp_ftrace_handler;
+ ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
+ FTRACE_OPS_FL_DYNAMIC |
+ FTRACE_OPS_FL_IPMODIFY;
+
+ list_add(&ops->node, &klp_ops);
+
+ INIT_LIST_HEAD(&ops->func_stack);
+ list_add_rcu(&func->stack_node, &ops->func_stack);
+
+ ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0);
+ if (ret) {
+ pr_err("failed to set ftrace filter for function '%s' (%d)\n",
+ func->old_name, ret);
+ goto err;
+ }
+
+ ret = register_ftrace_function(&ops->fops);
+ if (ret) {
+ pr_err("failed to register ftrace handler for function '%s' (%d)\n",
+ func->old_name, ret);
+ ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
+ goto err;
+ }
+
+
+ } else {
+ list_add_rcu(&func->stack_node, &ops->func_stack);
+ }
+
+ func->state = KLP_ENABLED;
+
+ return 0;
+
+err:
+ list_del_rcu(&func->stack_node);
+ list_del(&ops->node);
+ kfree(ops);
+ return ret;
+}
+
+static int klp_disable_object(struct klp_object *obj)
+{
+ struct klp_func *func;
+ int ret;
+
+ for (func = obj->funcs; func->old_name; func++) {
+ if (func->state != KLP_ENABLED)
+ continue;
+
+ ret = klp_disable_func(func);
+ if (ret)
+ return ret;
+ }
+
+ obj->state = KLP_DISABLED;
+
+ return 0;
+}
+
+static int klp_enable_object(struct klp_object *obj)
+{
+ struct klp_func *func;
+ int ret;
+
+ if (WARN_ON(obj->state != KLP_DISABLED))
+ return -EINVAL;
+
+ if (WARN_ON(!klp_is_object_loaded(obj)))
+ return -EINVAL;
+
+ for (func = obj->funcs; func->old_name; func++) {
+ ret = klp_enable_func(func);
+ if (ret)
+ goto unregister;
+ }
+ obj->state = KLP_ENABLED;
+
+ return 0;
+
+unregister:
+ WARN_ON(klp_disable_object(obj));
+ return ret;
+}
+
+static int __klp_disable_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+ int ret;
+
+ /* enforce stacking: only the last enabled patch can be disabled */
+ if (!list_is_last(&patch->list, &klp_patches) &&
+ list_next_entry(patch, list)->state == KLP_ENABLED)
+ return -EBUSY;
+
+ pr_notice("disabling patch '%s'\n", patch->mod->name);
+
+ for (obj = patch->objs; obj->funcs; obj++) {
+ if (obj->state != KLP_ENABLED)
+ continue;
+
+ ret = klp_disable_object(obj);
+ if (ret)
+ return ret;
+ }
+
+ patch->state = KLP_DISABLED;
+
+ return 0;
+}
+
+/**
+ * klp_disable_patch() - disables a registered patch
+ * @patch: The registered, enabled patch to be disabled
+ *
+ * Unregisters the patched functions from ftrace.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_disable_patch(struct klp_patch *patch)
+{
+ int ret;
+
+ mutex_lock(&klp_mutex);
+
+ if (!klp_is_patch_registered(patch)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (patch->state == KLP_DISABLED) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = __klp_disable_patch(patch);
+
+err:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_disable_patch);
+
+static int __klp_enable_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+ int ret;
+
+ if (WARN_ON(patch->state != KLP_DISABLED))
+ return -EINVAL;
+
+ /* enforce stacking: only the first disabled patch can be enabled */
+ if (patch->list.prev != &klp_patches &&
+ list_prev_entry(patch, list)->state == KLP_DISABLED)
+ return -EBUSY;
+
+ pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n");
+ add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+
+ pr_notice("enabling patch '%s'\n", patch->mod->name);
+
+ for (obj = patch->objs; obj->funcs; obj++) {
+ klp_find_object_module(obj);
+
+ if (!klp_is_object_loaded(obj))
+ continue;
+
+ ret = klp_enable_object(obj);
+ if (ret)
+ goto unregister;
+ }
+
+ patch->state = KLP_ENABLED;
+
+ return 0;
+
+unregister:
+ WARN_ON(__klp_disable_patch(patch));
+ return ret;
+}
+
+/**
+ * klp_enable_patch() - enables a registered patch
+ * @patch: The registered, disabled patch to be enabled
+ *
+ * Performs the needed symbol lookups and code relocations,
+ * then registers the patched functions with ftrace.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_enable_patch(struct klp_patch *patch)
+{
+ int ret;
+
+ mutex_lock(&klp_mutex);
+
+ if (!klp_is_patch_registered(patch)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = __klp_enable_patch(patch);
+
+err:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_enable_patch);
+
+/*
+ * Sysfs Interface
+ *
+ * /sys/kernel/livepatch
+ * /sys/kernel/livepatch/<patch>
+ * /sys/kernel/livepatch/<patch>/enabled
+ * /sys/kernel/livepatch/<patch>/<object>
+ * /sys/kernel/livepatch/<patch>/<object>/<func>
+ */
+
+static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct klp_patch *patch;
+ int ret;
+ unsigned long val;
+
+ ret = kstrtoul(buf, 10, &val);
+ if (ret)
+ return -EINVAL;
+
+ if (val != KLP_DISABLED && val != KLP_ENABLED)
+ return -EINVAL;
+
+ patch = container_of(kobj, struct klp_patch, kobj);
+
+ mutex_lock(&klp_mutex);
+
+ if (val == patch->state) {
+ /* already in requested state */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (val == KLP_ENABLED) {
+ ret = __klp_enable_patch(patch);
+ if (ret)
+ goto err;
+ } else {
+ ret = __klp_disable_patch(patch);
+ if (ret)
+ goto err;
+ }
+
+ mutex_unlock(&klp_mutex);
+
+ return count;
+
+err:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+
+static ssize_t enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_patch *patch;
+
+ patch = container_of(kobj, struct klp_patch, kobj);
+ return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state);
+}
+
+static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
+static struct attribute *klp_patch_attrs[] = {
+ &enabled_kobj_attr.attr,
+ NULL
+};
+
+static void klp_kobj_release_patch(struct kobject *kobj)
+{
+ /*
+ * Once we have a consistency model we'll need to module_put() the
+ * patch module here. See klp_register_patch() for more details.
+ */
+}
+
+static struct kobj_type klp_ktype_patch = {
+ .release = klp_kobj_release_patch,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_attrs = klp_patch_attrs,
+};
+
+static void klp_kobj_release_func(struct kobject *kobj)
+{
+}
+
+static struct kobj_type klp_ktype_func = {
+ .release = klp_kobj_release_func,
+ .sysfs_ops = &kobj_sysfs_ops,
+};
+
+/*
+ * Free all functions' kobjects in the array up to some limit. When limit is
+ * NULL, all kobjects are freed.
+ */
+static void klp_free_funcs_limited(struct klp_object *obj,
+ struct klp_func *limit)
+{
+ struct klp_func *func;
+
+ for (func = obj->funcs; func->old_name && func != limit; func++)
+ kobject_put(&func->kobj);
+}
+
+/* Clean up when a patched object is unloaded */
+static void klp_free_object_loaded(struct klp_object *obj)
+{
+ struct klp_func *func;
+
+ obj->mod = NULL;
+
+ for (func = obj->funcs; func->old_name; func++)
+ func->old_addr = 0;
+}
+
+/*
+ * Free all objects' kobjects in the array up to some limit. When limit is
+ * NULL, all kobjects are freed.
+ */
+static void klp_free_objects_limited(struct klp_patch *patch,
+ struct klp_object *limit)
+{
+ struct klp_object *obj;
+
+ for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
+ klp_free_funcs_limited(obj, NULL);
+ kobject_put(obj->kobj);
+ }
+}
+
+static void klp_free_patch(struct klp_patch *patch)
+{
+ klp_free_objects_limited(patch, NULL);
+ if (!list_empty(&patch->list))
+ list_del(&patch->list);
+ kobject_put(&patch->kobj);
+}
+
+static int klp_init_func(struct klp_object *obj, struct klp_func *func)
+{
+ INIT_LIST_HEAD(&func->stack_node);
+ func->state = KLP_DISABLED;
+
+ return kobject_init_and_add(&func->kobj, &klp_ktype_func,
+ obj->kobj, func->old_name);
+}
+
+/* parts of the initialization that is done only when the object is loaded */
+static int klp_init_object_loaded(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ struct klp_func *func;
+ int ret;
+
+ if (obj->relocs) {
+ ret = klp_write_object_relocations(patch->mod, obj);
+ if (ret)
+ return ret;
+ }
+
+ for (func = obj->funcs; func->old_name; func++) {
+ ret = klp_find_verify_func_addr(obj, func);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
+{
+ struct klp_func *func;
+ int ret;
+ const char *name;
+
+ if (!obj->funcs)
+ return -EINVAL;
+
+ obj->state = KLP_DISABLED;
+
+ klp_find_object_module(obj);
+
+ name = klp_is_module(obj) ? obj->name : "vmlinux";
+ obj->kobj = kobject_create_and_add(name, &patch->kobj);
+ if (!obj->kobj)
+ return -ENOMEM;
+
+ for (func = obj->funcs; func->old_name; func++) {
+ ret = klp_init_func(obj, func);
+ if (ret)
+ goto free;
+ }
+
+ if (klp_is_object_loaded(obj)) {
+ ret = klp_init_object_loaded(patch, obj);
+ if (ret)
+ goto free;
+ }
+
+ return 0;
+
+free:
+ klp_free_funcs_limited(obj, func);
+ kobject_put(obj->kobj);
+ return ret;
+}
+
+static int klp_init_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+ int ret;
+
+ if (!patch->objs)
+ return -EINVAL;
+
+ mutex_lock(&klp_mutex);
+
+ patch->state = KLP_DISABLED;
+
+ ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
+ klp_root_kobj, patch->mod->name);
+ if (ret)
+ goto unlock;
+
+ for (obj = patch->objs; obj->funcs; obj++) {
+ ret = klp_init_object(patch, obj);
+ if (ret)
+ goto free;
+ }
+
+ list_add_tail(&patch->list, &klp_patches);
+
+ mutex_unlock(&klp_mutex);
+
+ return 0;
+
+free:
+ klp_free_objects_limited(patch, obj);
+ kobject_put(&patch->kobj);
+unlock:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+
+/**
+ * klp_unregister_patch() - unregisters a patch
+ * @patch: Disabled patch to be unregistered
+ *
+ * Frees the data structures and removes the sysfs interface.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_unregister_patch(struct klp_patch *patch)
+{
+ int ret = 0;
+
+ mutex_lock(&klp_mutex);
+
+ if (!klp_is_patch_registered(patch)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (patch->state == KLP_ENABLED) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ klp_free_patch(patch);
+
+out:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_unregister_patch);
+
+/**
+ * klp_register_patch() - registers a patch
+ * @patch: Patch to be registered
+ *
+ * Initializes the data structure associated with the patch and
+ * creates the sysfs interface.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_register_patch(struct klp_patch *patch)
+{
+ int ret;
+
+ if (!klp_initialized())
+ return -ENODEV;
+
+ if (!patch || !patch->mod)
+ return -EINVAL;
+
+ /*
+ * A reference is taken on the patch module to prevent it from being
+ * unloaded. Right now, we don't allow patch modules to unload since
+ * there is currently no method to determine if a thread is still
+ * running in the patched code contained in the patch module once
+ * the ftrace registration is successful.
+ */
+ if (!try_module_get(patch->mod))
+ return -ENODEV;
+
+ ret = klp_init_patch(patch);
+ if (ret)
+ module_put(patch->mod);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_register_patch);
+
+static void klp_module_notify_coming(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ struct module *pmod = patch->mod;
+ struct module *mod = obj->mod;
+ int ret;
+
+ ret = klp_init_object_loaded(patch, obj);
+ if (ret)
+ goto err;
+
+ if (patch->state == KLP_DISABLED)
+ return;
+
+ pr_notice("applying patch '%s' to loading module '%s'\n",
+ pmod->name, mod->name);
+
+ ret = klp_enable_object(obj);
+ if (!ret)
+ return;
+
+err:
+ pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
+ pmod->name, mod->name, ret);
+}
+
+static void klp_module_notify_going(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ struct module *pmod = patch->mod;
+ struct module *mod = obj->mod;
+ int ret;
+
+ if (patch->state == KLP_DISABLED)
+ goto disabled;
+
+ pr_notice("reverting patch '%s' on unloading module '%s'\n",
+ pmod->name, mod->name);
+
+ ret = klp_disable_object(obj);
+ if (ret)
+ pr_warn("failed to revert patch '%s' on module '%s' (%d)\n",
+ pmod->name, mod->name, ret);
+
+disabled:
+ klp_free_object_loaded(obj);
+}
+
+static int klp_module_notify(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ struct module *mod = data;
+ struct klp_patch *patch;
+ struct klp_object *obj;
+
+ if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING)
+ return 0;
+
+ mutex_lock(&klp_mutex);
+
+ list_for_each_entry(patch, &klp_patches, list) {
+ for (obj = patch->objs; obj->funcs; obj++) {
+ if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+ continue;
+
+ if (action == MODULE_STATE_COMING) {
+ obj->mod = mod;
+ klp_module_notify_coming(patch, obj);
+ } else /* MODULE_STATE_GOING */
+ klp_module_notify_going(patch, obj);
+
+ break;
+ }
+ }
+
+ mutex_unlock(&klp_mutex);
+
+ return 0;
+}
+
+static struct notifier_block klp_module_nb = {
+ .notifier_call = klp_module_notify,
+ .priority = INT_MIN+1, /* called late but before ftrace notifier */
+};
+
+static int klp_init(void)
+{
+ int ret;
+
+ ret = klp_check_compiler_support();
+ if (ret) {
+ pr_info("Your compiler is too old; turning off.\n");
+ return -EINVAL;
+ }
+
+ ret = register_module_notifier(&klp_module_nb);
+ if (ret)
+ return ret;
+
+ klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
+ if (!klp_root_kobj) {
+ ret = -ENOMEM;
+ goto unregister;
+ }
+
+ return 0;
+
+unregister:
+ unregister_module_notifier(&klp_module_nb);
+ return ret;
+}
+
+module_init(klp_init);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 4caca3f7af53..de7a416cca2a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
-obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
+obj-y += mutex.o semaphore.o rwsem.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 4d60986fcbee..d1fe2ba5bac9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
arch_mcs_spin_unlock_contended(&next->locked);
}
-/*
- * Cancellable version of the MCS lock above.
- *
- * Intended for adaptive spinning of sleeping locks:
- * mutex_lock()/rwsem_down_{read,write}() etc.
- */
-
-struct optimistic_spin_node {
- struct optimistic_spin_node *next, *prev;
- int locked; /* 1 if lock acquired */
- int cpu; /* encoded CPU # value */
-};
-
-extern bool osq_lock(struct optimistic_spin_queue *lock);
-extern void osq_unlock(struct optimistic_spin_queue *lock);
-
#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 5cf6731b98e9..3ef3736002d8 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock)
DEBUG_LOCKS_WARN_ON(lock->owner != current);
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
- mutex_clear_owner(lock);
}
/*
* __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
* mutexes so that we can do it here after we've verified state.
*/
+ mutex_clear_owner(lock);
atomic_set(&lock->count, 1);
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 454195194d4a..94674e5919cb 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
* The mutex must later on be released by the same task that
* acquired it. Recursive locking is not allowed. The task
* may not exit without first unlocking the mutex. Also, kernel
- * memory where the mutex resides mutex must not be freed with
+ * memory where the mutex resides must not be freed with
* the mutex still locked. The mutex must first be initialized
* (or statically defined) before it can be locked. memset()-ing
* the mutex to 0 is not allowed.
@@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
}
/*
- * after acquiring lock with fastpath or when we lost out in contested
+ * After acquiring lock with fastpath or when we lost out in contested
* slowpath, set ctx and wake up any waiters so they can recheck.
*
* This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
@@ -191,19 +191,32 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
spin_unlock_mutex(&lock->base.wait_lock, flags);
}
-
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
- * In order to avoid a stampede of mutex spinners from acquiring the mutex
- * more or less simultaneously, the spinners need to acquire a MCS lock
- * first before spinning on the owner field.
+ * After acquiring lock in the slowpath set ctx and wake up any
+ * waiters so they can recheck.
*
+ * Callers must hold the mutex wait_lock.
*/
+static __always_inline void
+ww_mutex_set_context_slowpath(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ctx)
+{
+ struct mutex_waiter *cur;
-/*
- * Mutex spinning code migrated from kernel/sched/core.c
- */
+ ww_mutex_lock_acquired(lock, ctx);
+ lock->ctx = ctx;
+
+ /*
+ * Give any possible sleeping processes the chance to wake up,
+ * so they can recheck if they have to back off.
+ */
+ list_for_each_entry(cur, &lock->base.wait_list, list) {
+ debug_mutex_wake_waiter(&lock->base, cur);
+ wake_up_process(cur->task);
+ }
+}
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
{
if (lock->owner != owner)
@@ -307,6 +320,11 @@ static bool mutex_optimistic_spin(struct mutex *lock,
if (!mutex_can_spin_on_owner(lock))
goto done;
+ /*
+ * In order to avoid a stampede of mutex spinners trying to
+ * acquire the mutex all at once, the spinners need to take a
+ * MCS (queued) lock first before spinning on the owner field.
+ */
if (!osq_lock(&lock->osq))
goto done;
@@ -469,7 +487,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
EXPORT_SYMBOL(ww_mutex_unlock);
static inline int __sched
-__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
+__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
{
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
@@ -557,7 +575,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}
if (use_ww_ctx && ww_ctx->acquired > 0) {
- ret = __mutex_lock_check_stamp(lock, ww_ctx);
+ ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
if (ret)
goto err;
}
@@ -569,6 +587,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
schedule_preempt_disabled();
spin_lock_mutex(&lock->wait_lock, flags);
}
+ __set_task_state(task, TASK_RUNNING);
+
mutex_remove_waiter(lock, &waiter, current_thread_info());
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
@@ -582,23 +602,7 @@ skip_wait:
if (use_ww_ctx) {
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct mutex_waiter *cur;
-
- /*
- * This branch gets optimized out for the common case,
- * and is only important for ww_mutex_lock.
- */
- ww_mutex_lock_acquired(ww, ww_ctx);
- ww->ctx = ww_ctx;
-
- /*
- * Give any possible sleeping processes the chance to wake up,
- * so they can recheck if they have to back off.
- */
- list_for_each_entry(cur, &lock->wait_list, list) {
- debug_mutex_wake_waiter(lock, cur);
- wake_up_process(cur->task);
- }
+ ww_mutex_set_context_slowpath(ww, ww_ctx);
}
spin_unlock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c
index 9887a905a762..c112d00341b0 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,8 +1,6 @@
#include <linux/percpu.h>
#include <linux/sched.h>
-#include "mcs_spinlock.h"
-
-#ifdef CONFIG_SMP
+#include <linux/osq_lock.h>
/*
* An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -111,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* cmpxchg in an attempt to undo our queueing.
*/
- while (!smp_load_acquire(&node->locked)) {
+ while (!ACCESS_ONCE(node->locked)) {
/*
* If we need to reschedule bail... so we can block.
*/
@@ -203,6 +201,3 @@ void osq_unlock(struct optimistic_spin_queue *lock)
if (next)
ACCESS_ONCE(next->locked) = 1;
}
-
-#endif
-
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7c98873a3077..3059bc2f022d 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
set_current_state(state);
}
+ __set_current_state(TASK_RUNNING);
return ret;
}
@@ -1188,10 +1189,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
if (likely(!ret))
+ /* sleep on the mutex */
ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
- set_current_state(TASK_RUNNING);
-
if (unlikely(ret)) {
remove_waiter(lock, &waiter);
rt_mutex_handle_deadlock(ret, chwalk, &waiter);
@@ -1626,10 +1626,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
set_current_state(TASK_INTERRUPTIBLE);
+ /* sleep on the mutex */
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
- set_current_state(TASK_RUNNING);
-
if (unlikely(ret))
remove_waiter(lock, waiter);
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2c93571162cb..2555ae15ec14 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -154,7 +154,7 @@ void __sched __down_read(struct rw_semaphore *sem)
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
}
- tsk->state = TASK_RUNNING;
+ __set_task_state(tsk, TASK_RUNNING);
out:
;
}
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 7628c3fc37ca..2f7cc4076f50 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -242,8 +242,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
schedule();
}
- tsk->state = TASK_RUNNING;
-
+ __set_task_state(tsk, TASK_RUNNING);
return sem;
}
EXPORT_SYMBOL(rwsem_down_read_failed);
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..db3ccb1dd614 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
}
EXPORT_SYMBOL(_raw_spin_lock_nested);
+void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
+
unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
int subclass)
{
diff --git a/kernel/module.c b/kernel/module.c
index e52a8739361a..d856e96a3cce 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -42,7 +42,6 @@
#include <linux/vermagic.h>
#include <linux/notifier.h>
#include <linux/sched.h>
-#include <linux/stop_machine.h>
#include <linux/device.h>
#include <linux/string.h>
#include <linux/mutex.h>
@@ -98,7 +97,7 @@
* 1) List of modules (also safely readable with preempt_disable),
* 2) module_use links,
* 3) module_addr_min/module_addr_max.
- * (delete uses stop_machine/add uses RCU list operations). */
+ * (delete and add uses RCU list operations). */
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
@@ -158,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
* Protected by module_mutex. */
static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-int register_module_notifier(struct notifier_block * nb)
+int register_module_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&module_notify_list, nb);
}
EXPORT_SYMBOL(register_module_notifier);
-int unregister_module_notifier(struct notifier_block * nb)
+int unregister_module_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&module_notify_list, nb);
}
@@ -628,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
EXPORT_TRACEPOINT_SYMBOL(module_get);
+/* MODULE_REF_BASE is the base reference count by kmodule loader. */
+#define MODULE_REF_BASE 1
+
/* Init the unload section of the module. */
static int module_unload_init(struct module *mod)
{
- mod->refptr = alloc_percpu(struct module_ref);
- if (!mod->refptr)
- return -ENOMEM;
+ /*
+ * Initialize reference counter to MODULE_REF_BASE.
+ * refcnt == 0 means module is going.
+ */
+ atomic_set(&mod->refcnt, MODULE_REF_BASE);
INIT_LIST_HEAD(&mod->source_list);
INIT_LIST_HEAD(&mod->target_list);
/* Hold reference count during initialization. */
- raw_cpu_write(mod->refptr->incs, 1);
+ atomic_inc(&mod->refcnt);
return 0;
}
@@ -721,8 +725,6 @@ static void module_unload_free(struct module *mod)
kfree(use);
}
mutex_unlock(&module_mutex);
-
- free_percpu(mod->refptr);
}
#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -740,60 +742,48 @@ static inline int try_force_unload(unsigned int flags)
}
#endif /* CONFIG_MODULE_FORCE_UNLOAD */
-struct stopref
+/* Try to release refcount of module, 0 means success. */
+static int try_release_module_ref(struct module *mod)
{
- struct module *mod;
- int flags;
- int *forced;
-};
+ int ret;
-/* Whole machine is stopped with interrupts off when this runs. */
-static int __try_stop_module(void *_sref)
-{
- struct stopref *sref = _sref;
+ /* Try to decrement refcnt which we set at loading */
+ ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+ BUG_ON(ret < 0);
+ if (ret)
+ /* Someone can put this right now, recover with checking */
+ ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
+ return ret;
+}
+
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
/* If it's not unused, quit unless we're forcing. */
- if (module_refcount(sref->mod) != 0) {
- if (!(*sref->forced = try_force_unload(sref->flags)))
+ if (try_release_module_ref(mod) != 0) {
+ *forced = try_force_unload(flags);
+ if (!(*forced))
return -EWOULDBLOCK;
}
/* Mark it as dying. */
- sref->mod->state = MODULE_STATE_GOING;
- return 0;
-}
-
-static int try_stop_module(struct module *mod, int flags, int *forced)
-{
- struct stopref sref = { mod, flags, forced };
+ mod->state = MODULE_STATE_GOING;
- return stop_machine(__try_stop_module, &sref, NULL);
+ return 0;
}
-unsigned long module_refcount(struct module *mod)
+/**
+ * module_refcount - return the refcount or -1 if unloading
+ *
+ * @mod: the module we're checking
+ *
+ * Returns:
+ * -1 if the module is in the process of unloading
+ * otherwise the number of references in the kernel to the module
+ */
+int module_refcount(struct module *mod)
{
- unsigned long incs = 0, decs = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- decs += per_cpu_ptr(mod->refptr, cpu)->decs;
- /*
- * ensure the incs are added up after the decs.
- * module_put ensures incs are visible before decs with smp_wmb.
- *
- * This 2-count scheme avoids the situation where the refcount
- * for CPU0 is read, then CPU0 increments the module refcount,
- * then CPU1 drops that refcount, then the refcount for CPU1 is
- * read. We would record a decrement but not its corresponding
- * increment so we would see a low count (disaster).
- *
- * Rare situation? But module_refcount can be preempted, and we
- * might be tallying up 4096+ CPUs. So it is not impossible.
- */
- smp_rmb();
- for_each_possible_cpu(cpu)
- incs += per_cpu_ptr(mod->refptr, cpu)->incs;
- return incs - decs;
+ return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
}
EXPORT_SYMBOL(module_refcount);
@@ -875,10 +865,12 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
struct module_use *use;
int printed_something = 0;
- seq_printf(m, " %lu ", module_refcount(mod));
+ seq_printf(m, " %i ", module_refcount(mod));
- /* Always include a trailing , so userspace can differentiate
- between this and the old multi-field proc format. */
+ /*
+ * Always include a trailing , so userspace can differentiate
+ * between this and the old multi-field proc format.
+ */
list_for_each_entry(use, &mod->source_list, source_list) {
printed_something = 1;
seq_printf(m, "%s,", use->source->name);
@@ -886,11 +878,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
if (mod->init != NULL && mod->exit == NULL) {
printed_something = 1;
- seq_printf(m, "[permanent],");
+ seq_puts(m, "[permanent],");
}
if (!printed_something)
- seq_printf(m, "-");
+ seq_puts(m, "-");
}
void __symbol_put(const char *symbol)
@@ -925,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
static ssize_t show_refcnt(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
+ return sprintf(buffer, "%i\n", module_refcount(mk->mod));
}
static struct module_attribute modinfo_refcnt =
@@ -935,7 +927,7 @@ void __module_get(struct module *module)
{
if (module) {
preempt_disable();
- __this_cpu_inc(module->refptr->incs);
+ atomic_inc(&module->refcnt);
trace_module_get(module, _RET_IP_);
preempt_enable();
}
@@ -948,11 +940,11 @@ bool try_module_get(struct module *module)
if (module) {
preempt_disable();
-
- if (likely(module_is_live(module))) {
- __this_cpu_inc(module->refptr->incs);
+ /* Note: here, we can fail to get a reference */
+ if (likely(module_is_live(module) &&
+ atomic_inc_not_zero(&module->refcnt) != 0))
trace_module_get(module, _RET_IP_);
- } else
+ else
ret = false;
preempt_enable();
@@ -963,11 +955,12 @@ EXPORT_SYMBOL(try_module_get);
void module_put(struct module *module)
{
+ int ret;
+
if (module) {
preempt_disable();
- smp_wmb(); /* see comment in module_refcount */
- __this_cpu_inc(module->refptr->decs);
-
+ ret = atomic_dec_if_positive(&module->refcnt);
+ WARN_ON(ret < 0); /* Failed to put refcount */
trace_module_put(module, _RET_IP_);
preempt_enable();
}
@@ -978,7 +971,7 @@ EXPORT_SYMBOL(module_put);
static inline void print_unload_info(struct seq_file *m, struct module *mod)
{
/* We don't know the usage count, or what modules are using. */
- seq_printf(m, " - -");
+ seq_puts(m, " - -");
}
static inline void module_unload_free(struct module *mod)
@@ -1131,7 +1124,7 @@ static unsigned long maybe_relocated(unsigned long crc,
static int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
- struct module *mod,
+ struct module *mod,
const unsigned long *crc,
const struct module *crc_owner)
{
@@ -1165,7 +1158,7 @@ static int check_version(Elf_Shdr *sechdrs,
return 0;
bad_version:
- printk("%s: disagrees about version of symbol %s\n",
+ pr_warn("%s: disagrees about version of symbol %s\n",
mod->name, symname);
return 0;
}
@@ -1200,7 +1193,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
static inline int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
- struct module *mod,
+ struct module *mod,
const unsigned long *crc,
const struct module *crc_owner)
{
@@ -1288,15 +1281,13 @@ static inline bool sect_empty(const Elf_Shdr *sect)
return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
}
-struct module_sect_attr
-{
+struct module_sect_attr {
struct module_attribute mattr;
char *name;
unsigned long address;
};
-struct module_sect_attrs
-{
+struct module_sect_attrs {
struct attribute_group grp;
unsigned int nsections;
struct module_sect_attr attrs[0];
@@ -1550,7 +1541,8 @@ static int module_add_modinfo_attrs(struct module *mod)
(attr->test && attr->test(mod))) {
memcpy(temp_attr, attr, sizeof(*temp_attr));
sysfs_attr_init(&temp_attr->attr);
- error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
+ error = sysfs_create_file(&mod->mkobj.kobj,
+ &temp_attr->attr);
++temp_attr;
}
}
@@ -1566,7 +1558,7 @@ static void module_remove_modinfo_attrs(struct module *mod)
/* pick a field to test for end of list */
if (!attr->attr.name)
break;
- sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
+ sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
if (attr->free)
attr->free(mod);
}
@@ -1697,18 +1689,6 @@ static void mod_sysfs_teardown(struct module *mod)
mod_sysfs_fini(mod);
}
-/*
- * unlink the module with the whole machine is stopped with interrupts off
- * - this defends against kallsyms not taking locks
- */
-static int __unlink_module(void *_mod)
-{
- struct module *mod = _mod;
- list_del(&mod->list);
- module_bug_cleanup(mod);
- return 0;
-}
-
#ifdef CONFIG_DEBUG_SET_MODULE_RONX
/*
* LKM RO/NX protection: protect module's text/ro-data
@@ -1824,7 +1804,7 @@ static void unset_module_core_ro_nx(struct module *mod) { }
static void unset_module_init_ro_nx(struct module *mod) { }
#endif
-void __weak module_free(struct module *mod, void *module_region)
+void __weak module_memfree(void *module_region)
{
vfree(module_region);
}
@@ -1833,6 +1813,10 @@ void __weak module_arch_cleanup(struct module *mod)
{
}
+void __weak module_arch_freeing_init(struct module *mod)
+{
+}
+
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
@@ -1860,12 +1844,18 @@ static void free_module(struct module *mod)
/* Now we can delete it from the lists */
mutex_lock(&module_mutex);
- stop_machine(__unlink_module, mod, NULL);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ /* Remove this module from bug list, this uses list_del_rcu */
+ module_bug_cleanup(mod);
+ /* Wait for RCU synchronizing before releasing mod->list and buglist. */
+ synchronize_rcu();
mutex_unlock(&module_mutex);
/* This may be NULL, but that's OK */
unset_module_init_ro_nx(mod);
- module_free(mod, mod->module_init);
+ module_arch_freeing_init(mod);
+ module_memfree(mod->module_init);
kfree(mod->args);
percpu_modfree(mod);
@@ -1874,7 +1864,7 @@ static void free_module(struct module *mod)
/* Finally, free the core (containing the module structure) */
unset_module_core_ro_nx(mod);
- module_free(mod, mod->module_core);
+ module_memfree(mod->module_core);
#ifdef CONFIG_MPU
update_protections(current->mm);
@@ -1955,7 +1945,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
/* We compiled with -fno-common. These are not
supposed to happen. */
pr_debug("Common symbol: %s\n", name);
- printk("%s: please compile with -fno-common\n",
+ pr_warn("%s: please compile with -fno-common\n",
mod->name);
ret = -ENOEXEC;
break;
@@ -2259,7 +2249,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum)
+ unsigned int shnum)
{
const Elf_Shdr *sec;
@@ -2735,7 +2725,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
* This shouldn't happen with same compiler and binutils
* building all parts of the module.
*/
- printk(KERN_WARNING "%s: has both .ctors and .init_array.\n",
+ pr_warn("%s: has both .ctors and .init_array.\n",
mod->name);
return -EINVAL;
}
@@ -2809,7 +2799,7 @@ static int move_module(struct module *mod, struct load_info *info)
*/
kmemleak_ignore(ptr);
if (!ptr) {
- module_free(mod, mod->module_core);
+ module_memfree(mod->module_core);
return -ENOMEM;
}
memset(ptr, 0, mod->init_size);
@@ -2954,8 +2944,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
static void module_deallocate(struct module *mod, struct load_info *info)
{
percpu_modfree(mod);
- module_free(mod, mod->module_init);
- module_free(mod, mod->module_core);
+ module_arch_freeing_init(mod);
+ module_memfree(mod->module_init);
+ module_memfree(mod->module_core);
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3007,10 +2998,31 @@ static void do_mod_ctors(struct module *mod)
#endif
}
+/* For freeing module_init on success, in case kallsyms traversing */
+struct mod_initfree {
+ struct rcu_head rcu;
+ void *module_init;
+};
+
+static void do_free_init(struct rcu_head *head)
+{
+ struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
+ module_memfree(m->module_init);
+ kfree(m);
+}
+
/* This is where the real work happens */
static int do_init_module(struct module *mod)
{
int ret = 0;
+ struct mod_initfree *freeinit;
+
+ freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
+ if (!freeinit) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ freeinit->module_init = mod->module_init;
/*
* We want to find out whether @mod uses async during init. Clear
@@ -3023,16 +3035,7 @@ static int do_init_module(struct module *mod)
if (mod->init != NULL)
ret = do_one_initcall(mod->init);
if (ret < 0) {
- /* Init routine failed: abort. Try to protect us from
- buggy refcounters. */
- mod->state = MODULE_STATE_GOING;
- synchronize_sched();
- module_put(mod);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- free_module(mod);
- wake_up_all(&module_wq);
- return ret;
+ goto fail_free_freeinit;
}
if (ret > 0) {
pr_warn("%s: '%s'->init suspiciously returned %d, it should "
@@ -3077,15 +3080,35 @@ static int do_init_module(struct module *mod)
mod->strtab = mod->core_strtab;
#endif
unset_module_init_ro_nx(mod);
- module_free(mod, mod->module_init);
+ module_arch_freeing_init(mod);
mod->module_init = NULL;
mod->init_size = 0;
mod->init_ro_size = 0;
mod->init_text_size = 0;
+ /*
+ * We want to free module_init, but be aware that kallsyms may be
+ * walking this with preempt disabled. In all the failure paths,
+ * we call synchronize_rcu/synchronize_sched, but we don't want
+ * to slow down the success path, so use actual RCU here.
+ */
+ call_rcu(&freeinit->rcu, do_free_init);
mutex_unlock(&module_mutex);
wake_up_all(&module_wq);
return 0;
+
+fail_free_freeinit:
+ kfree(freeinit);
+fail:
+ /* Try to protect us from buggy refcounters. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_sched();
+ module_put(mod);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ free_module(mod);
+ wake_up_all(&module_wq);
+ return ret;
}
static int may_init_module(void)
@@ -3202,7 +3225,7 @@ out:
static int unknown_module_param_cb(char *param, char *val, const char *modname)
{
- /* Check for magic 'dyndbg' arg */
+ /* Check for magic 'dyndbg' arg */
int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
if (ret != 0)
pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
@@ -3352,6 +3375,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
wake_up_all(&module_wq);
+ /* Wait for RCU synchronizing before releasing mod->list. */
+ synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
module_deallocate(mod, info);
@@ -3685,8 +3710,8 @@ static int m_show(struct seq_file *m, void *p)
/* Informative for users. */
seq_printf(m, " %s",
- mod->state == MODULE_STATE_GOING ? "Unloading":
- mod->state == MODULE_STATE_COMING ? "Loading":
+ mod->state == MODULE_STATE_GOING ? "Unloading" :
+ mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
seq_printf(m, " 0x%pK", mod->module_core);
@@ -3695,7 +3720,7 @@ static int m_show(struct seq_file *m, void *p)
if (mod->taints)
seq_printf(m, " %s", module_flags(mod, buf));
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
return 0;
}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4803da6eab62..ae9fc7cc360e 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
+#ifdef CONFIG_SRCU
/*
* SRCU notifier chain routines. Registration and unregistration
* use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
+#endif /* CONFIG_SRCU */
+
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d8d6f906dec..8136ad76e5fd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -226,6 +226,7 @@ static const struct tnt tnts[] = {
{ TAINT_OOT_MODULE, 'O', ' ' },
{ TAINT_UNSIGNED_MODULE, 'E', ' ' },
{ TAINT_SOFTLOCKUP, 'L', ' ' },
+ { TAINT_LIVEPATCH, 'K', ' ' },
};
/**
@@ -246,6 +247,7 @@ static const struct tnt tnts[] = {
* 'O' - Out-of-tree module has been loaded.
* 'E' - Unsigned module has been loaded.
* 'L' - A soft lockup has previously occurred.
+ * 'K' - Kernel has been live patched.
*
* The string is overwritten by the next call to print_tainted().
*/
diff --git a/kernel/params.c b/kernel/params.c
index db97b791390f..728e05b167de 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -603,74 +603,70 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
const struct kernel_param *kp,
const char *name)
{
- struct module_param_attrs *new;
- struct attribute **attrs;
- int err, num;
+ struct module_param_attrs *new_mp;
+ struct attribute **new_attrs;
+ unsigned int i;
/* We don't bother calling this with invisible parameters. */
BUG_ON(!kp->perm);
if (!mk->mp) {
- num = 0;
- attrs = NULL;
- } else {
- num = mk->mp->num;
- attrs = mk->mp->grp.attrs;
+ /* First allocation. */
+ mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL);
+ if (!mk->mp)
+ return -ENOMEM;
+ mk->mp->grp.name = "parameters";
+ /* NULL-terminated attribute array. */
+ mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]),
+ GFP_KERNEL);
+ /* Caller will cleanup via free_module_param_attrs */
+ if (!mk->mp->grp.attrs)
+ return -ENOMEM;
}
- /* Enlarge. */
- new = krealloc(mk->mp,
- sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
- GFP_KERNEL);
- if (!new) {
- kfree(attrs);
- err = -ENOMEM;
- goto fail;
- }
- /* Despite looking like the typical realloc() bug, this is safe.
- * We *want* the old 'attrs' to be freed either way, and we'll store
- * the new one in the success case. */
- attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
- if (!attrs) {
- err = -ENOMEM;
- goto fail_free_new;
- }
+ /* Enlarge allocations. */
+ new_mp = krealloc(mk->mp,
+ sizeof(*mk->mp) +
+ sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1),
+ GFP_KERNEL);
+ if (!new_mp)
+ return -ENOMEM;
+ mk->mp = new_mp;
- /* Sysfs wants everything zeroed. */
- memset(new, 0, sizeof(*new));
- memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
- memset(&attrs[num], 0, sizeof(attrs[num]));
- new->grp.name = "parameters";
- new->grp.attrs = attrs;
+ /* Extra pointer for NULL terminator */
+ new_attrs = krealloc(mk->mp->grp.attrs,
+ sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2),
+ GFP_KERNEL);
+ if (!new_attrs)
+ return -ENOMEM;
+ mk->mp->grp.attrs = new_attrs;
/* Tack new one on the end. */
- sysfs_attr_init(&new->attrs[num].mattr.attr);
- new->attrs[num].param = kp;
- new->attrs[num].mattr.show = param_attr_show;
- new->attrs[num].mattr.store = param_attr_store;
- new->attrs[num].mattr.attr.name = (char *)name;
- new->attrs[num].mattr.attr.mode = kp->perm;
- new->num = num+1;
+ memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0]));
+ sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
+ mk->mp->attrs[mk->mp->num].param = kp;
+ mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
+ /* Do not allow runtime DAC changes to make param writable. */
+ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
+ mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
+ else
+ mk->mp->attrs[mk->mp->num].mattr.store = NULL;
+ mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
+ mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
+ mk->mp->num++;
/* Fix up all the pointers, since krealloc can move us */
- for (num = 0; num < new->num; num++)
- new->grp.attrs[num] = &new->attrs[num].mattr.attr;
- new->grp.attrs[num] = NULL;
-
- mk->mp = new;
+ for (i = 0; i < mk->mp->num; i++)
+ mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr;
+ mk->mp->grp.attrs[mk->mp->num] = NULL;
return 0;
-
-fail_free_new:
- kfree(new);
-fail:
- mk->mp = NULL;
- return err;
}
#ifdef CONFIG_MODULES
static void free_module_param_attrs(struct module_kobject *mk)
{
- kfree(mk->mp->grp.attrs);
+ if (mk->mp)
+ kfree(mk->mp->grp.attrs);
kfree(mk->mp);
mk->mp = NULL;
}
@@ -695,8 +691,10 @@ int module_param_sysfs_setup(struct module *mod,
if (kparam[i].perm == 0)
continue;
err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
- if (err)
+ if (err) {
+ free_module_param_attrs(&mod->mkobj);
return err;
+ }
params = true;
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 6e7708c2c21f..7e01f78f0417 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -94,7 +94,7 @@ config PM_STD_PARTITION
config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
- select PM_RUNTIME
+ select PM
config PM_SLEEP_SMP
def_bool y
@@ -130,23 +130,19 @@ config PM_WAKELOCKS_GC
depends on PM_WAKELOCKS
default y
-config PM_RUNTIME
- bool "Run-time PM core functionality"
+config PM
+ bool "Device power management core functionality"
---help---
Enable functionality allowing I/O devices to be put into energy-saving
- (low power) states at run time (or autosuspended) after a specified
- period of inactivity and woken up in response to a hardware-generated
+ (low power) states, for example after a specified period of inactivity
+ (autosuspended), and woken up in response to a hardware-generated
wake-up event or a driver's request.
Hardware support is generally required for this functionality to work
and the bus type drivers of the buses the devices are on are
- responsible for the actual handling of the autosuspend requests and
+ responsible for the actual handling of device suspend requests and
wake-up events.
-config PM
- def_bool y
- depends on PM_SLEEP || PM_RUNTIME
-
config PM_DEBUG
bool "Power Management Debug Support"
depends on PM
@@ -255,6 +251,7 @@ config APM_EMULATION
config PM_OPP
bool
+ select SRCU
---help---
SOCs have a standard set of tuples consisting of frequency and
voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 5f4c006c4b1e..97b0df71303e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -41,6 +41,8 @@
#include <linux/platform_device.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/export.h>
@@ -182,6 +184,81 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
c->target_value = value;
}
+static inline int pm_qos_get_value(struct pm_qos_constraints *c);
+static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
+{
+ struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
+ struct pm_qos_constraints *c;
+ struct pm_qos_request *req;
+ char *type;
+ unsigned long flags;
+ int tot_reqs = 0;
+ int active_reqs = 0;
+
+ if (IS_ERR_OR_NULL(qos)) {
+ pr_err("%s: bad qos param!\n", __func__);
+ return -EINVAL;
+ }
+ c = qos->constraints;
+ if (IS_ERR_OR_NULL(c)) {
+ pr_err("%s: Bad constraints on qos?\n", __func__);
+ return -EINVAL;
+ }
+
+ /* Lock to ensure we have a snapshot */
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ if (plist_head_empty(&c->list)) {
+ seq_puts(s, "Empty!\n");
+ goto out;
+ }
+
+ switch (c->type) {
+ case PM_QOS_MIN:
+ type = "Minimum";
+ break;
+ case PM_QOS_MAX:
+ type = "Maximum";
+ break;
+ case PM_QOS_SUM:
+ type = "Sum";
+ break;
+ default:
+ type = "Unknown";
+ }
+
+ plist_for_each_entry(req, &c->list, node) {
+ char *state = "Default";
+
+ if ((req->node).prio != c->default_value) {
+ active_reqs++;
+ state = "Active";
+ }
+ tot_reqs++;
+ seq_printf(s, "%d: %d: %s\n", tot_reqs,
+ (req->node).prio, state);
+ }
+
+ seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n",
+ type, pm_qos_get_value(c), active_reqs, tot_reqs);
+
+out:
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+ return 0;
+}
+
+static int pm_qos_dbg_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pm_qos_dbg_show_requests,
+ inode->i_private);
+}
+
+static const struct file_operations pm_qos_debug_fops = {
+ .open = pm_qos_dbg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
/**
* pm_qos_update_target - manages the constraints list and calls the notifiers
* if needed
@@ -509,12 +586,17 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
/* User space interface to PM QoS classes via misc devices */
-static int register_pm_qos_misc(struct pm_qos_object *qos)
+static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
{
qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
qos->pm_qos_power_miscdev.name = qos->name;
qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
+ if (d) {
+ (void)debugfs_create_file(qos->name, S_IRUGO, d,
+ (void *)qos, &pm_qos_debug_fops);
+ }
+
return misc_register(&qos->pm_qos_power_miscdev);
}
@@ -608,11 +690,16 @@ static int __init pm_qos_power_init(void)
{
int ret = 0;
int i;
+ struct dentry *d;
BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
+ d = debugfs_create_dir("pm_qos", NULL);
+ if (IS_ERR_OR_NULL(d))
+ d = NULL;
+
for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
- ret = register_pm_qos_misc(pm_qos_array[i]);
+ ret = register_pm_qos_misc(pm_qos_array[i], d);
if (ret < 0) {
printk(KERN_ERR "pm_qos_param: %s setup failed\n",
pm_qos_array[i]->name);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0c40c16174b4..c24d5a23bf93 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1472,9 +1472,9 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
/**
* free_unnecessary_pages - Release preallocated pages not needed for the image
*/
-static void free_unnecessary_pages(void)
+static unsigned long free_unnecessary_pages(void)
{
- unsigned long save, to_free_normal, to_free_highmem;
+ unsigned long save, to_free_normal, to_free_highmem, free;
save = count_data_pages();
if (alloc_normal >= save) {
@@ -1495,6 +1495,7 @@ static void free_unnecessary_pages(void)
else
to_free_normal = 0;
}
+ free = to_free_normal + to_free_highmem;
memory_bm_position_reset(&copy_bm);
@@ -1518,6 +1519,8 @@ static void free_unnecessary_pages(void)
swsusp_unset_page_free(page);
__free_page(page);
}
+
+ return free;
}
/**
@@ -1707,7 +1710,7 @@ int hibernate_preallocate_memory(void)
* pages in memory, but we have allocated more. Release the excessive
* ones now.
*/
- free_unnecessary_pages();
+ pages -= free_unnecessary_pages();
out:
stop = ktime_get();
@@ -2310,8 +2313,6 @@ static inline void free_highmem_data(void)
free_image_page(buffer, PG_UNSAFE_CLEAR);
}
#else
-static inline int get_safe_write_buffer(void) { return 0; }
-
static unsigned int
count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
diff --git a/kernel/range.c b/kernel/range.c
index 322ea8e93e4b..82cfc285b046 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -113,12 +113,12 @@ static int cmp_range(const void *x1, const void *x2)
{
const struct range *r1 = x1;
const struct range *r2 = x2;
- s64 start1, start2;
- start1 = r1->start;
- start2 = r2->start;
-
- return start1 - start2;
+ if (r1->start < r2->start)
+ return -1;
+ if (r1->start > r2->start)
+ return 1;
+ return 0;
}
int clean_sort_range(struct range *range, int az)
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index e6fae503d1bc..50a808424b06 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,5 @@
-obj-y += update.o srcu.o
+obj-y += update.o
+obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_PREEMPT_RCU) += tree.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 07bb02eda844..80adef7d4c3d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void);
void rcu_early_boot_tests(void);
+/*
+ * This function really isn't for public consumption, but RCU is special in
+ * that context switches can allow the state machine to make progress.
+ */
+extern void resched_cpu(int cpu);
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 4d559baf06e0..30d42aa55d83 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -244,7 +244,8 @@ struct rcu_torture_ops {
int (*readlock)(void);
void (*read_delay)(struct torture_random_state *rrsp);
void (*readunlock)(int idx);
- int (*completed)(void);
+ unsigned long (*started)(void);
+ unsigned long (*completed)(void);
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
@@ -296,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU)
rcu_read_unlock();
}
-static int rcu_torture_completed(void)
-{
- return rcu_batches_completed();
-}
-
/*
* Update callback in the pipe. This should be invoked after a grace period.
*/
@@ -356,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p)
cur_ops->deferred_free(rp);
}
-static int rcu_no_completed(void)
+static unsigned long rcu_no_completed(void)
{
return 0;
}
@@ -377,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = {
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
- .completed = rcu_torture_completed,
+ .started = rcu_batches_started,
+ .completed = rcu_batches_completed,
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
@@ -407,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
rcu_read_unlock_bh();
}
-static int rcu_bh_torture_completed(void)
-{
- return rcu_batches_completed_bh();
-}
-
static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
{
call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
@@ -423,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = {
.readlock = rcu_bh_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
- .completed = rcu_bh_torture_completed,
+ .started = rcu_batches_started_bh,
+ .completed = rcu_batches_completed_bh,
.deferred_free = rcu_bh_torture_deferred_free,
.sync = synchronize_rcu_bh,
.exp_sync = synchronize_rcu_bh_expedited,
@@ -466,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock,
+ .started = rcu_no_completed,
.completed = rcu_no_completed,
.deferred_free = rcu_busted_torture_deferred_free,
.sync = synchronize_rcu_busted,
@@ -510,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
srcu_read_unlock(&srcu_ctl, idx);
}
-static int srcu_torture_completed(void)
+static unsigned long srcu_torture_completed(void)
{
return srcu_batches_completed(&srcu_ctl);
}
@@ -564,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .started = NULL,
.completed = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
@@ -600,7 +595,8 @@ static struct rcu_torture_ops sched_ops = {
.readlock = sched_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = sched_torture_read_unlock,
- .completed = rcu_no_completed,
+ .started = rcu_batches_started_sched,
+ .completed = rcu_batches_completed_sched,
.deferred_free = rcu_sched_torture_deferred_free,
.sync = synchronize_sched,
.exp_sync = synchronize_sched_expedited,
@@ -638,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = {
.readlock = tasks_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = tasks_torture_read_unlock,
+ .started = rcu_no_completed,
.completed = rcu_no_completed,
.deferred_free = rcu_tasks_torture_deferred_free,
.sync = synchronize_rcu_tasks,
@@ -1015,8 +1012,8 @@ static void rcutorture_trace_dump(void)
static void rcu_torture_timer(unsigned long unused)
{
int idx;
- int completed;
- int completed_end;
+ unsigned long started;
+ unsigned long completed;
static DEFINE_TORTURE_RANDOM(rand);
static DEFINE_SPINLOCK(rand_lock);
struct rcu_torture *p;
@@ -1024,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused)
unsigned long long ts;
idx = cur_ops->readlock();
- completed = cur_ops->completed();
+ if (cur_ops->started)
+ started = cur_ops->started();
+ else
+ started = cur_ops->completed();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
@@ -1047,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused)
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- completed_end = cur_ops->completed();
+ completed = cur_ops->completed();
if (pipe_count > 1) {
do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
- completed, completed_end);
+ started, completed);
rcutorture_trace_dump();
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = completed_end - completed;
+ completed = completed - started;
+ if (cur_ops->started)
+ completed++;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
@@ -1073,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused)
static int
rcu_torture_reader(void *arg)
{
- int completed;
- int completed_end;
+ unsigned long started;
+ unsigned long completed;
int idx;
DEFINE_TORTURE_RANDOM(rand);
struct rcu_torture *p;
@@ -1093,7 +1095,10 @@ rcu_torture_reader(void *arg)
mod_timer(&t, jiffies + 1);
}
idx = cur_ops->readlock();
- completed = cur_ops->completed();
+ if (cur_ops->started)
+ started = cur_ops->started();
+ else
+ started = cur_ops->completed();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
@@ -1114,14 +1119,16 @@ rcu_torture_reader(void *arg)
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- completed_end = cur_ops->completed();
+ completed = cur_ops->completed();
if (pipe_count > 1) {
do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
- ts, completed, completed_end);
+ ts, started, completed);
rcutorture_trace_dump();
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = completed_end - completed;
+ completed = completed - started;
+ if (cur_ops->started)
+ completed++;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
@@ -1420,6 +1427,9 @@ static int rcu_torture_barrier(void *arg)
cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
n_rcu_torture_barrier_error++;
+ pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n",
+ atomic_read(&barrier_cbs_invoked),
+ n_barrier_cbs);
WARN_ON_ONCE(1);
}
n_barrier_successes++;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index e037f3eb2f7b..445bf8ffe3fb 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
* Report the number of batches, correlated with, but not necessarily
* precisely the same as, the number of grace periods that have elapsed.
*/
-long srcu_batches_completed(struct srcu_struct *sp)
+unsigned long srcu_batches_completed(struct srcu_struct *sp)
{
return sp->completed;
}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 0db5649f8817..cc9ceca7bde1 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu),
struct rcu_ctrlblk *rcp);
-static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-
#include "tiny_plugin.h"
-/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
-static void rcu_idle_enter_common(long long newval)
-{
- if (newval) {
- RCU_TRACE(trace_rcu_dyntick(TPS("--="),
- rcu_dynticks_nesting, newval));
- rcu_dynticks_nesting = newval;
- return;
- }
- RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
- rcu_dynticks_nesting, newval));
- if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
- struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
-
- RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
- rcu_dynticks_nesting, newval));
- ftrace_dump(DUMP_ALL);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
- }
- rcu_sched_qs(); /* implies rcu_bh_inc() */
- barrier();
- rcu_dynticks_nesting = newval;
-}
-
/*
* Enter idle, which is an extended quiescent state if we have fully
- * entered that mode (i.e., if the new value of dynticks_nesting is zero).
+ * entered that mode.
*/
void rcu_idle_enter(void)
{
- unsigned long flags;
- long long newval;
-
- local_irq_save(flags);
- WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
- if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
- DYNTICK_TASK_NEST_VALUE)
- newval = 0;
- else
- newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
- rcu_idle_enter_common(newval);
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
*/
void rcu_irq_exit(void)
{
- unsigned long flags;
- long long newval;
-
- local_irq_save(flags);
- newval = rcu_dynticks_nesting - 1;
- WARN_ON_ONCE(newval < 0);
- rcu_idle_enter_common(newval);
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_irq_exit);
-/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
-static void rcu_idle_exit_common(long long oldval)
-{
- if (oldval) {
- RCU_TRACE(trace_rcu_dyntick(TPS("++="),
- oldval, rcu_dynticks_nesting));
- return;
- }
- RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
- if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
- struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
-
- RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
- oldval, rcu_dynticks_nesting));
- ftrace_dump(DUMP_ALL);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
- }
-}
-
/*
* Exit idle, so that we are no longer in an extended quiescent state.
*/
void rcu_idle_exit(void)
{
- unsigned long flags;
- long long oldval;
-
- local_irq_save(flags);
- oldval = rcu_dynticks_nesting;
- WARN_ON_ONCE(rcu_dynticks_nesting < 0);
- if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
- rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
- else
- rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_idle_exit_common(oldval);
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
*/
void rcu_irq_enter(void)
{
- unsigned long flags;
- long long oldval;
-
- local_irq_save(flags);
- oldval = rcu_dynticks_nesting;
- rcu_dynticks_nesting++;
- WARN_ON_ONCE(rcu_dynticks_nesting == 0);
- rcu_idle_exit_common(oldval);
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_irq_enter);
@@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter);
*/
bool notrace __rcu_is_watching(void)
{
- return rcu_dynticks_nesting;
+ return true;
}
EXPORT_SYMBOL(__rcu_is_watching);
#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
/*
- * Test whether the current CPU was interrupted from idle. Nested
- * interrupts don't count, we must be running at the first interrupt
- * level.
- */
-static int rcu_is_cpu_rrupt_from_idle(void)
-{
- return rcu_dynticks_nesting <= 1;
-}
-
-/*
* Helper function for rcu_sched_qs() and rcu_bh_qs().
* Also irqs are disabled to avoid confusion due to interrupt handlers
* invoking call_rcu().
@@ -250,7 +150,7 @@ void rcu_bh_qs(void)
void rcu_check_callbacks(int user)
{
RCU_TRACE(check_cpu_stalls());
- if (user || rcu_is_cpu_rrupt_from_idle())
+ if (user)
rcu_sched_qs();
else if (!in_softirq())
rcu_bh_qs();
@@ -357,6 +257,11 @@ static void __call_rcu(struct rcu_head *head,
rcp->curtail = &head->next;
RCU_TRACE(rcp->qlen++);
local_irq_restore(flags);
+
+ if (unlikely(is_idle_task(current))) {
+ /* force scheduling for rcu_sched_qs() */
+ resched_cpu(0);
+ }
}
/*
@@ -383,6 +288,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
+ RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
rcu_early_boot_tests();
}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 858c56569127..f94e209a10d6 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
rcp->ticks_this_gp++;
j = jiffies;
js = ACCESS_ONCE(rcp->jiffies_stall);
- if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
+ if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
- rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
+ rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
jiffies - rcp->gp_start, rcp->qlen);
dump_stack();
- }
- if (*rcp->curtail && ULONG_CMP_GE(j, js))
ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
3 * rcu_jiffies_till_stall_check() + 3;
- else if (ULONG_CMP_GE(j, js))
+ } else if (ULONG_CMP_GE(j, js)) {
ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+ }
}
static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7680fc275036..48d640ca1a05 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
+/* rcuc/rcub kthread realtime priority */
+static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+module_param(kthread_prio, int, 0644);
+
/*
* Track the rcutorture test sequence number and the update version
* number within a given test. The rcutorture_testseq is incremented
@@ -215,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
};
+DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
+EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
+
/*
* Let the RCU core know that this CPU has gone through the scheduler,
* which is a quiescent state. This is called when the need for a
@@ -284,6 +291,22 @@ void rcu_note_context_switch(void)
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
+/*
+ * Register a quiesecent state for all RCU flavors. If there is an
+ * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
+ * dyntick-idle quiescent state visible to other CPUs (but only for those
+ * RCU flavors in desparate need of a quiescent state, which will normally
+ * be none of them). Either way, do a lightweight quiescent state for
+ * all RCU flavors.
+ */
+void rcu_all_qs(void)
+{
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ rcu_momentary_dyntick_idle();
+ this_cpu_inc(rcu_qs_ctr);
+}
+EXPORT_SYMBOL_GPL(rcu_all_qs);
+
static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
static long qhimark = 10000; /* If this many pending, ignore blimit. */
static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -315,18 +338,54 @@ static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(void);
/*
- * Return the number of RCU-sched batches processed thus far for debug & stats.
+ * Return the number of RCU batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started(void)
+{
+ return rcu_state_p->gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started);
+
+/*
+ * Return the number of RCU-sched batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started_sched(void)
+{
+ return rcu_sched_state.gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
+
+/*
+ * Return the number of RCU BH batches started thus far for debug & stats.
*/
-long rcu_batches_completed_sched(void)
+unsigned long rcu_batches_started_bh(void)
+{
+ return rcu_bh_state.gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
+
+/*
+ * Return the number of RCU batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed(void)
+{
+ return rcu_state_p->completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU-sched batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed_sched(void)
{
return rcu_sched_state.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
/*
- * Return the number of RCU BH batches processed thus far for debug & stats.
+ * Return the number of RCU BH batches completed thus far for debug & stats.
*/
-long rcu_batches_completed_bh(void)
+unsigned long rcu_batches_completed_bh(void)
{
return rcu_bh_state.completed;
}
@@ -759,39 +818,71 @@ void rcu_irq_enter(void)
/**
* rcu_nmi_enter - inform RCU of entry to NMI context
*
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is active.
+ * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
+ * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active. This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int. (You will probably
+ * run out of stack space first.)
*/
void rcu_nmi_enter(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int incby = 2;
- if (rdtp->dynticks_nmi_nesting == 0 &&
- (atomic_read(&rdtp->dynticks) & 0x1))
- return;
- rdtp->dynticks_nmi_nesting++;
- smp_mb__before_atomic(); /* Force delay from prior write. */
- atomic_inc(&rdtp->dynticks);
- /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ /* Complain about underflow. */
+ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
+
+ /*
+ * If idle from RCU viewpoint, atomically increment ->dynticks
+ * to mark non-idle and increment ->dynticks_nmi_nesting by one.
+ * Otherwise, increment ->dynticks_nmi_nesting by two. This means
+ * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+ * to be in the outermost NMI handler that interrupted an RCU-idle
+ * period (observation due to Andy Lutomirski).
+ */
+ if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
+ smp_mb__before_atomic(); /* Force delay from prior write. */
+ atomic_inc(&rdtp->dynticks);
+ /* atomic_inc() before later RCU read-side crit sects */
+ smp_mb__after_atomic(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ incby = 1;
+ }
+ rdtp->dynticks_nmi_nesting += incby;
+ barrier();
}
/**
* rcu_nmi_exit - inform RCU of exit from NMI context
*
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is no longer active.
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
*/
void rcu_nmi_exit(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- if (rdtp->dynticks_nmi_nesting == 0 ||
- --rdtp->dynticks_nmi_nesting != 0)
+ /*
+ * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+ * (We are exiting an NMI handler, so RCU better be paying attention
+ * to us!)
+ */
+ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+
+ /*
+ * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+ * leave it in non-RCU-idle state.
+ */
+ if (rdtp->dynticks_nmi_nesting != 1) {
+ rdtp->dynticks_nmi_nesting -= 2;
return;
+ }
+
+ /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+ rdtp->dynticks_nmi_nesting = 0;
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
@@ -898,17 +989,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
return 1;
} else {
+ if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+ rdp->mynode->gpnum))
+ ACCESS_ONCE(rdp->gpwrap) = true;
return 0;
}
}
/*
- * This function really isn't for public consumption, but RCU is special in
- * that context switches can allow the state machine to make progress.
- */
-extern void resched_cpu(int cpu);
-
-/*
* Return true if the specified CPU has passed through a quiescent
* state by virtue of being in or having passed through an dynticks
* idle state since the last call to dyntick_save_progress_counter()
@@ -1011,6 +1099,22 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
j1 = rcu_jiffies_till_stall_check();
ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
rsp->jiffies_resched = j + j1 / 2;
+ rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+}
+
+/*
+ * Complain about starvation of grace-period kthread.
+ */
+static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
+{
+ unsigned long gpa;
+ unsigned long j;
+
+ j = jiffies;
+ gpa = ACCESS_ONCE(rsp->gp_activity);
+ if (j - gpa > 2 * HZ)
+ pr_err("%s kthread starved for %ld jiffies!\n",
+ rsp->name, j - gpa);
}
/*
@@ -1033,11 +1137,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
}
}
-static void print_other_cpu_stall(struct rcu_state *rsp)
+static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
{
int cpu;
long delta;
unsigned long flags;
+ unsigned long gpa;
+ unsigned long j;
int ndetected = 0;
struct rcu_node *rnp = rcu_get_root(rsp);
long totqlen = 0;
@@ -1075,30 +1181,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
- /*
- * Now rat on any tasks that got kicked up to the root rcu_node
- * due to CPU offlining.
- */
- rnp = rcu_get_root(rsp);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- ndetected += rcu_print_task_stall(rnp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start),
(long)rsp->gpnum, (long)rsp->completed, totqlen);
- if (ndetected == 0)
- pr_err("INFO: Stall ended before state dump start\n");
- else
+ if (ndetected) {
rcu_dump_cpu_stacks(rsp);
+ } else {
+ if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
+ ACCESS_ONCE(rsp->completed) == gpnum) {
+ pr_err("INFO: Stall ended before state dump start\n");
+ } else {
+ j = jiffies;
+ gpa = ACCESS_ONCE(rsp->gp_activity);
+ pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
+ rsp->name, j - gpa, j, gpa,
+ jiffies_till_next_fqs);
+ /* In this case, the current CPU might be at fault. */
+ sched_show_task(current);
+ }
+ }
/* Complain about tasks blocking the grace period. */
-
rcu_print_detail_task_stall(rsp);
+ rcu_check_gp_kthread_starvation(rsp);
+
force_quiescent_state(rsp); /* Kick them all. */
}
@@ -1123,6 +1233,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
jiffies - rsp->gp_start,
(long)rsp->gpnum, (long)rsp->completed, totqlen);
+
+ rcu_check_gp_kthread_starvation(rsp);
+
rcu_dump_cpu_stacks(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1193,7 +1306,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
/* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(rsp);
+ print_other_cpu_stall(rsp, gpnum);
}
}
@@ -1530,7 +1643,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
bool ret;
/* Handle the ends of any preceding grace periods first. */
- if (rdp->completed == rnp->completed) {
+ if (rdp->completed == rnp->completed &&
+ !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
/* No grace period end, so just accelerate recent callbacks. */
ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1545,7 +1659,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
}
- if (rdp->gpnum != rnp->gpnum) {
+ if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
/*
* If the current grace period is waiting for this CPU,
* set up to detect a quiescent state, otherwise don't
@@ -1554,8 +1668,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->gpnum = rnp->gpnum;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
rdp->passed_quiesce = 0;
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp);
+ ACCESS_ONCE(rdp->gpwrap) = false;
}
return ret;
}
@@ -1569,7 +1685,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
local_irq_save(flags);
rnp = rdp->mynode;
if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
- rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
+ rdp->completed == ACCESS_ONCE(rnp->completed) &&
+ !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
!raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
local_irq_restore(flags);
return;
@@ -1589,6 +1706,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
rcu_bind_gp_kthread();
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
@@ -1649,6 +1767,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
}
mutex_unlock(&rsp->onoff_mutex);
@@ -1665,6 +1784,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp);
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
rsp->n_force_qs++;
if (fqs_state == RCU_SAVE_DYNTICK) {
/* Collect dyntick-idle snapshots. */
@@ -1703,6 +1823,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
gp_duration = jiffies - rsp->gp_start;
@@ -1739,6 +1860,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
@@ -1788,6 +1910,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
if (rcu_gp_init(rsp))
break;
cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
@@ -1831,9 +1954,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
ACCESS_ONCE(rsp->gpnum),
TPS("fqsend"));
cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
} else {
/* Deal with stray signal. */
cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
@@ -2010,8 +2135,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
- rnp->completed == rnp->gpnum) {
+ if ((rdp->passed_quiesce == 0 &&
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
+ rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
+ rdp->gpwrap) {
/*
* The grace period in which this quiescent state was
@@ -2020,6 +2147,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* within the current grace period.
*/
rdp->passed_quiesce = 0; /* need qs for new gp. */
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
@@ -2064,7 +2192,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
- if (!rdp->passed_quiesce)
+ if (!rdp->passed_quiesce &&
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
return;
/*
@@ -2195,6 +2324,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
}
/*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section. Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields. Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely. That said, invoking it after the fact will cost you
+ * a needless lock acquisition. So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+ return;
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (!rnp)
+ break;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock(); /* GP memory ordering. */
+ rnp->qsmaskinit &= ~mask;
+ if (rnp->qsmaskinit) {
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ return;
+ }
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ }
+}
+
+/*
* The CPU has been completely removed, and some other CPU is reporting
* this fact from process context. Do the remainder of the cleanup,
* including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2204,8 +2373,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
- unsigned long mask;
- int need_report = 0;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
@@ -2219,40 +2386,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
rcu_adopt_orphan_cbs(rsp, flags);
+ raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
- /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
- mask = rdp->grpmask; /* rnp->grplo is constant. */
- do {
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- rnp->qsmaskinit &= ~mask;
- if (rnp->qsmaskinit != 0) {
- if (rnp != rdp->mynode)
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- break;
- }
- if (rnp == rdp->mynode)
- need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
- else
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- mask = rnp->grpmask;
- rnp = rnp->parent;
- } while (rnp != NULL);
-
- /*
- * We still hold the leaf rcu_node structure lock here, and
- * irqs are still disabled. The reason for this subterfuge is
- * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
- * held leads to deadlock.
- */
- raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
- rnp = rdp->mynode;
- if (need_report & RCU_OFL_TASKS_NORM_GP)
- rcu_report_unblock_qs_rnp(rnp, flags);
- else
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- if (need_report & RCU_OFL_TASKS_EXP_GP)
- rcu_report_exp_rnp(rsp, rnp, true);
+ /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
+ rnp->qsmaskinit &= ~rdp->grpmask;
+ if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
+ rcu_cleanup_dead_rnp(rnp);
+ rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
cpu, rdp->qlen, rdp->nxtlist);
@@ -2268,6 +2410,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
{
}
+static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+}
+
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
}
@@ -2464,12 +2610,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
}
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
- rnp = rcu_get_root(rsp);
- if (rnp->qsmask == 0) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
- }
}
/*
@@ -2569,7 +2709,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
* Schedule RCU callback invocation. If the specified type of RCU
* does not support RCU priority boosting, just do a direct call,
* otherwise wake up the per-CPU kernel kthread. Note that because we
- * are running on the current CPU with interrupts disabled, the
+ * are running on the current CPU with softirqs disabled, the
* rcu_cpu_kthread_task cannot disappear out from under us.
*/
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -3109,9 +3249,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
/* Is the RCU core waiting for a quiescent state from this CPU? */
if (rcu_scheduler_fully_active &&
- rdp->qs_pending && !rdp->passed_quiesce) {
+ rdp->qs_pending && !rdp->passed_quiesce &&
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
rdp->n_rp_qs_pending++;
- } else if (rdp->qs_pending && rdp->passed_quiesce) {
+ } else if (rdp->qs_pending &&
+ (rdp->passed_quiesce ||
+ rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
rdp->n_rp_report_qs++;
return 1;
}
@@ -3135,7 +3278,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
}
/* Has a new RCU grace period started? */
- if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
+ if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
+ unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
rdp->n_rp_gp_started++;
return 1;
}
@@ -3318,6 +3462,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
} else {
_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
rsp->n_barrier_done);
+ smp_mb__before_atomic();
atomic_inc(&rsp->barrier_cpu_count);
__call_rcu(&rdp->barrier_head,
rcu_barrier_callback, rsp, cpu, 0);
@@ -3385,9 +3530,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
- init_callback_list(rdp);
- rdp->qlen_lazy = 0;
- ACCESS_ONCE(rdp->qlen) = 0;
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -3444,6 +3586,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->gpnum = rnp->completed;
rdp->completed = rnp->completed;
rdp->passed_quiesce = 0;
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
rdp->qs_pending = 0;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
}
@@ -3535,17 +3678,35 @@ static int rcu_pm_notify(struct notifier_block *self,
static int __init rcu_spawn_gp_kthread(void)
{
unsigned long flags;
+ int kthread_prio_in = kthread_prio;
struct rcu_node *rnp;
struct rcu_state *rsp;
+ struct sched_param sp;
struct task_struct *t;
+ /* Force priority into range. */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+ kthread_prio = 1;
+ else if (kthread_prio < 0)
+ kthread_prio = 0;
+ else if (kthread_prio > 99)
+ kthread_prio = 99;
+ if (kthread_prio != kthread_prio_in)
+ pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
+ kthread_prio, kthread_prio_in);
+
rcu_scheduler_fully_active = 1;
for_each_rcu_flavor(rsp) {
- t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
+ t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
BUG_ON(IS_ERR(t));
rnp = rcu_get_root(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
rsp->gp_kthread = t;
+ if (kthread_prio) {
+ sp.sched_priority = kthread_prio;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ wake_up_process(t);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
rcu_spawn_nocb_kthreads();
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e7b1843896e..119de399eb2f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,7 +27,6 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
-#include <linux/irq_work.h>
/*
* Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -172,11 +171,6 @@ struct rcu_node {
/* queued on this rcu_node structure that */
/* are blocking the current grace period, */
/* there can be no such task. */
- struct completion boost_completion;
- /* Used to ensure that the rt_mutex used */
- /* to carry out the boosting is fully */
- /* released with no future boostee accesses */
- /* before that rt_mutex is re-initialized. */
struct rt_mutex boost_mtx;
/* Used only for the priority-boosting */
/* side effect, not as a lock. */
@@ -257,9 +251,12 @@ struct rcu_data {
/* in order to detect GP end. */
unsigned long gpnum; /* Highest gp number that this CPU */
/* is aware of having started. */
+ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
+ /* for rcu_all_qs() invocations. */
bool passed_quiesce; /* User-mode/idle loop etc. */
bool qs_pending; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
+ bool gpwrap; /* Possible gpnum/completed wrap. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -340,14 +337,10 @@ struct rcu_data {
#ifdef CONFIG_RCU_NOCB_CPU
struct rcu_head *nocb_head; /* CBs waiting for kthread. */
struct rcu_head **nocb_tail;
- atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
- atomic_long_t nocb_q_count_lazy; /* (approximate). */
+ atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
+ atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
struct rcu_head **nocb_follower_tail;
- atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
- atomic_long_t nocb_follower_count_lazy; /* (approximate). */
- int nocb_p_count; /* # CBs being invoked by kthread */
- int nocb_p_count_lazy; /* (approximate). */
wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
@@ -356,8 +349,6 @@ struct rcu_data {
struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
/* CBs waiting for GP. */
struct rcu_head **nocb_gp_tail;
- long nocb_gp_count;
- long nocb_gp_count_lazy;
bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
struct rcu_data *nocb_next_follower;
/* Next follower in wakeup chain. */
@@ -488,10 +479,14 @@ struct rcu_state {
/* due to no GP active. */
unsigned long gp_start; /* Time at which GP started, */
/* but in jiffies. */
+ unsigned long gp_activity; /* Time of last GP kthread */
+ /* activity in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
unsigned long jiffies_resched; /* Time at which to resched */
/* a reluctant CPU. */
+ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
+ /* GP start. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
const char *name; /* Name of structure. */
@@ -514,13 +509,6 @@ extern struct list_head rcu_struct_flavors;
#define for_each_rcu_flavor(rsp) \
list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
-/* Return values for rcu_preempt_offline_tasks(). */
-
-#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
- /* GP were moved to root. */
-#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
- /* GP were moved to root. */
-
/*
* RCU implementation internal declarations:
*/
@@ -546,27 +534,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
-long rcu_batches_completed(void);
static void rcu_preempt_note_context_switch(void);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
- unsigned long flags);
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
-#ifdef CONFIG_HOTPLUG_CPU
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_preempt_check_callbacks(void);
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake);
-#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -622,24 +599,15 @@ static void rcu_dynticks_task_exit(void);
#endif /* #ifndef RCU_TREE_NONCORE */
#ifdef CONFIG_RCU_TRACE
-#ifdef CONFIG_RCU_NOCB_CPU
-/* Sum up queue lengths for tracing. */
+/* Read out queue lengths for tracing. */
static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
{
- *ql = atomic_long_read(&rdp->nocb_q_count) +
- rdp->nocb_p_count +
- atomic_long_read(&rdp->nocb_follower_count) +
- rdp->nocb_p_count + rdp->nocb_gp_count;
- *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
- rdp->nocb_p_count_lazy +
- atomic_long_read(&rdp->nocb_follower_count_lazy) +
- rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
-}
+#ifdef CONFIG_RCU_NOCB_CPU
+ *ql = atomic_long_read(&rdp->nocb_q_count);
+ *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
-{
*ql = 0;
*qll = 0;
-}
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+}
#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3ec85cb5d544..2e850a51bb8f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -34,10 +34,6 @@
#include "../locking/rtmutex_common.h"
-/* rcuc/rcub kthread realtime priority */
-static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
-module_param(kthread_prio, int, 0644);
-
/*
* Control variables for per-CPU and per-rcu_node kthreads. These
* handle all flavors of RCU.
@@ -103,6 +99,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *rcu_state_p = &rcu_preempt_state;
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake);
/*
* Tell them what RCU they are running.
@@ -114,25 +112,6 @@ static void __init rcu_bootup_announce(void)
}
/*
- * Return the number of RCU-preempt batches processed thus far
- * for debug and statistics.
- */
-static long rcu_batches_completed_preempt(void)
-{
- return rcu_preempt_state.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
-
-/*
- * Return the number of RCU batches processed thus far for debug & stats.
- */
-long rcu_batches_completed(void)
-{
- return rcu_batches_completed_preempt();
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-/*
* Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
* not in a quiescent state. There might be any number of tasks blocked
@@ -307,15 +286,25 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
}
/*
+ * Return true if the specified rcu_node structure has tasks that were
+ * preempted within an RCU read-side critical section.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
+{
+ return !list_empty(&rnp->blkd_tasks);
+}
+
+/*
* Handle special cases during rcu_read_unlock(), such as needing to
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
*/
void rcu_read_unlock_special(struct task_struct *t)
{
- int empty;
- int empty_exp;
- int empty_exp_now;
+ bool empty;
+ bool empty_exp;
+ bool empty_norm;
+ bool empty_exp_now;
unsigned long flags;
struct list_head *np;
#ifdef CONFIG_RCU_BOOST
@@ -367,7 +356,8 @@ void rcu_read_unlock_special(struct task_struct *t)
break;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
- empty = !rcu_preempt_blocked_readers_cgp(rnp);
+ empty = !rcu_preempt_has_tasks(rnp);
+ empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = !rcu_preempted_readers_exp(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
@@ -387,13 +377,21 @@ void rcu_read_unlock_special(struct task_struct *t)
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
+ * If this was the last task on the list, go see if we
+ * need to propagate ->qsmaskinit bit clearing up the
+ * rcu_node tree.
+ */
+ if (!empty && !rcu_preempt_has_tasks(rnp))
+ rcu_cleanup_dead_rnp(rnp);
+
+ /*
* If this was the last task on the current list, and if
* we aren't waiting on any CPUs, report the quiescent state.
* Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
* so we must take a snapshot of the expedited state.
*/
empty_exp_now = !rcu_preempted_readers_exp(rnp);
- if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
+ if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
rnp->gpnum,
0, rnp->qsmask,
@@ -408,10 +406,8 @@ void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
- if (drop_boost_mutex) {
+ if (drop_boost_mutex)
rt_mutex_unlock(&rnp->boost_mtx);
- complete(&rnp->boost_completion);
- }
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -519,99 +515,13 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
- if (!list_empty(&rnp->blkd_tasks))
+ if (rcu_preempt_has_tasks(rnp))
rnp->gp_tasks = rnp->blkd_tasks.next;
WARN_ON_ONCE(rnp->qsmask);
}
#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Handle tasklist migration for case in which all CPUs covered by the
- * specified rcu_node have gone offline. Move them up to the root
- * rcu_node. The reason for not just moving them to the immediate
- * parent is to remove the need for rcu_read_unlock_special() to
- * make more than two attempts to acquire the target rcu_node's lock.
- * Returns true if there were tasks blocking the current RCU grace
- * period.
- *
- * Returns 1 if there was previously a task blocking the current grace
- * period on the specified rcu_node structure.
- *
- * The caller must hold rnp->lock with irqs disabled.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp)
-{
- struct list_head *lp;
- struct list_head *lp_root;
- int retval = 0;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
- struct task_struct *t;
-
- if (rnp == rnp_root) {
- WARN_ONCE(1, "Last CPU thought to be offlined?");
- return 0; /* Shouldn't happen: at least one CPU online. */
- }
-
- /* If we are on an internal node, complain bitterly. */
- WARN_ON_ONCE(rnp != rdp->mynode);
-
- /*
- * Move tasks up to root rcu_node. Don't try to get fancy for
- * this corner-case operation -- just put this node's tasks
- * at the head of the root node's list, and update the root node's
- * ->gp_tasks and ->exp_tasks pointers to those of this node's,
- * if non-NULL. This might result in waiting for more tasks than
- * absolutely necessary, but this is a good performance/complexity
- * tradeoff.
- */
- if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
- retval |= RCU_OFL_TASKS_NORM_GP;
- if (rcu_preempted_readers_exp(rnp))
- retval |= RCU_OFL_TASKS_EXP_GP;
- lp = &rnp->blkd_tasks;
- lp_root = &rnp_root->blkd_tasks;
- while (!list_empty(lp)) {
- t = list_entry(lp->next, typeof(*t), rcu_node_entry);
- raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
- smp_mb__after_unlock_lock();
- list_del(&t->rcu_node_entry);
- t->rcu_blocked_node = rnp_root;
- list_add(&t->rcu_node_entry, lp_root);
- if (&t->rcu_node_entry == rnp->gp_tasks)
- rnp_root->gp_tasks = rnp->gp_tasks;
- if (&t->rcu_node_entry == rnp->exp_tasks)
- rnp_root->exp_tasks = rnp->exp_tasks;
-#ifdef CONFIG_RCU_BOOST
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp_root->boost_tasks = rnp->boost_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
- raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
- }
-
- rnp->gp_tasks = NULL;
- rnp->exp_tasks = NULL;
-#ifdef CONFIG_RCU_BOOST
- rnp->boost_tasks = NULL;
- /*
- * In case root is being boosted and leaf was not. Make sure
- * that we boost the tasks blocking the current grace period
- * in this case.
- */
- raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
- smp_mb__after_unlock_lock();
- if (rnp_root->boost_tasks != NULL &&
- rnp_root->boost_tasks != rnp_root->gp_tasks &&
- rnp_root->boost_tasks != rnp_root->exp_tasks)
- rnp_root->boost_tasks = rnp_root->gp_tasks;
- raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
- return retval;
-}
-
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
@@ -771,7 +681,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- if (list_empty(&rnp->blkd_tasks)) {
+ if (!rcu_preempt_has_tasks(rnp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
} else {
rnp->exp_tasks = rnp->blkd_tasks.next;
@@ -933,15 +843,6 @@ static void __init rcu_bootup_announce(void)
}
/*
- * Return the number of RCU batches processed thus far for debug & stats.
- */
-long rcu_batches_completed(void)
-{
- return rcu_batches_completed_sched();
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-/*
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
@@ -960,11 +861,12 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
#ifdef CONFIG_HOTPLUG_CPU
-/* Because preemptible RCU does not exist, no quieting of tasks. */
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
- __releases(rnp->lock)
+/*
+ * Because there is no preemptible RCU, there can be no readers blocked.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
{
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return false;
}
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -996,23 +898,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
WARN_ON_ONCE(rnp->qsmask);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Because preemptible RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections, and
- * such non-existent tasks cannot possibly have been blocking the current
- * grace period.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp)
-{
- return 0;
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
@@ -1031,20 +916,6 @@ void synchronize_rcu_expedited(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Because preemptible RCU does not exist, there is never any need to
- * report on tasks preempted in RCU read-side critical sections during
- * expedited RCU grace periods.
- */
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Because preemptible RCU does not exist, rcu_barrier() is just
* another name for rcu_barrier_sched().
@@ -1080,7 +951,7 @@ void exit_rcu(void)
static void rcu_initiate_boost_trace(struct rcu_node *rnp)
{
- if (list_empty(&rnp->blkd_tasks))
+ if (!rcu_preempt_has_tasks(rnp))
rnp->n_balk_blkd_tasks++;
else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
rnp->n_balk_exp_gp_tasks++;
@@ -1127,7 +998,8 @@ static int rcu_boost(struct rcu_node *rnp)
struct task_struct *t;
struct list_head *tb;
- if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
+ if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
+ ACCESS_ONCE(rnp->boost_tasks) == NULL)
return 0; /* Nothing left to boost. */
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1175,15 +1047,11 @@ static int rcu_boost(struct rcu_node *rnp)
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
- init_completion(&rnp->boost_completion);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
- /* Wait for boostee to be done w/boost_mtx before reinitializing. */
- wait_for_completion(&rnp->boost_completion);
-
return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
ACCESS_ONCE(rnp->boost_tasks) != NULL;
}
@@ -1416,12 +1284,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
if ((mask & 0x1) && cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
- if (cpumask_weight(cm) == 0) {
+ if (cpumask_weight(cm) == 0)
cpumask_setall(cm);
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
- cpumask_clear_cpu(cpu, cm);
- WARN_ON_ONCE(cpumask_weight(cm) == 0);
- }
set_cpus_allowed_ptr(t, cm);
free_cpumask_var(cm);
}
@@ -1446,12 +1310,8 @@ static void __init rcu_spawn_boost_kthreads(void)
for_each_possible_cpu(cpu)
per_cpu(rcu_cpu_has_work, cpu) = 0;
BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
- rnp = rcu_get_root(rcu_state_p);
- (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
- if (NUM_RCU_NODES > 1) {
- rcu_for_each_leaf_node(rcu_state_p, rnp)
- (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
- }
+ rcu_for_each_leaf_node(rcu_state_p, rnp)
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}
static void rcu_prepare_kthreads(int cpu)
@@ -1605,7 +1465,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
* completed since we last checked and there are
* callbacks not yet ready to invoke.
*/
- if (rdp->completed != rnp->completed &&
+ if ((rdp->completed != rnp->completed ||
+ unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
note_gp_changes(rsp, rdp);
@@ -1898,11 +1759,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
ticks_value = rsp->gpnum - rdp->gpnum;
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
+ pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
cpu, ticks_value, ticks_title,
atomic_read(&rdtp->dynticks) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
+ ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
fast_no_hz);
}
@@ -2056,9 +1918,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ unsigned long ret;
+#ifdef CONFIG_PROVE_RCU
struct rcu_head *rhp;
+#endif /* #ifdef CONFIG_PROVE_RCU */
- /* No-CBs CPUs might have callbacks on any of three lists. */
+ /*
+ * Check count of all no-CBs callbacks awaiting invocation.
+ * There needs to be a barrier before this function is called,
+ * but associated with a prior determination that no more
+ * callbacks would be posted. In the worst case, the first
+ * barrier in _rcu_barrier() suffices (but the caller cannot
+ * necessarily rely on this, not a substitute for the caller
+ * getting the concurrency design right!). There must also be
+ * a barrier between the following load an posting of a callback
+ * (if a callback is in fact needed). This is associated with an
+ * atomic_inc() in the caller.
+ */
+ ret = atomic_long_read(&rdp->nocb_q_count);
+
+#ifdef CONFIG_PROVE_RCU
rhp = ACCESS_ONCE(rdp->nocb_head);
if (!rhp)
rhp = ACCESS_ONCE(rdp->nocb_gp_head);
@@ -2072,8 +1951,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
cpu, rhp->func);
WARN_ON_ONCE(1);
}
+#endif /* #ifdef CONFIG_PROVE_RCU */
- return !!rhp;
+ return !!ret;
}
/*
@@ -2095,9 +1975,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
struct task_struct *t;
/* Enqueue the callback on the nocb list and update counts. */
+ atomic_long_add(rhcount, &rdp->nocb_q_count);
+ /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
old_rhpp = xchg(&rdp->nocb_tail, rhtp);
ACCESS_ONCE(*old_rhpp) = rhp;
- atomic_long_add(rhcount, &rdp->nocb_q_count);
atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
@@ -2288,9 +2169,6 @@ wait_again:
/* Move callbacks to wait-for-GP list, which is empty. */
ACCESS_ONCE(rdp->nocb_head) = NULL;
rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
- rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
- rdp->nocb_gp_count_lazy =
- atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
gotcbs = true;
}
@@ -2338,9 +2216,6 @@ wait_again:
/* Append callbacks to follower's "done" list. */
tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
*tail = rdp->nocb_gp_head;
- atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
- atomic_long_add(rdp->nocb_gp_count_lazy,
- &rdp->nocb_follower_count_lazy);
smp_mb__after_atomic(); /* Store *tail before wakeup. */
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
/*
@@ -2415,13 +2290,11 @@ static int rcu_nocb_kthread(void *arg)
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
- c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
- cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
- rdp->nocb_p_count += c;
- rdp->nocb_p_count_lazy += cl;
/* Each pass through the following loop invokes a callback. */
- trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
+ trace_rcu_batch_start(rdp->rsp->name,
+ atomic_long_read(&rdp->nocb_q_count_lazy),
+ atomic_long_read(&rdp->nocb_q_count), -1);
c = cl = 0;
while (list) {
next = list->next;
@@ -2443,9 +2316,9 @@ static int rcu_nocb_kthread(void *arg)
list = next;
}
trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
- ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c;
- ACCESS_ONCE(rdp->nocb_p_count_lazy) =
- rdp->nocb_p_count_lazy - cl;
+ smp_mb__before_atomic(); /* _add after CB invocation. */
+ atomic_long_add(-c, &rdp->nocb_q_count);
+ atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
rdp->n_nocbs_invoked += c;
}
return 0;
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 5cdc62e1beeb..fbb6240509ea 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -46,6 +46,8 @@
#define RCU_TREE_NONCORE
#include "tree.h"
+DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
+
static int r_open(struct inode *inode, struct file *file,
const struct seq_operations *op)
{
@@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
if (!rdp->beenonline)
return;
- seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d",
+ seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
ulong2long(rdp->completed), ulong2long(rdp->gpnum),
- rdp->passed_quiesce, rdp->qs_pending);
+ rdp->passed_quiesce,
+ rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
+ rdp->qs_pending);
seq_printf(m, " dt=%d/%llx/%d df=%lu",
atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
diff --git a/kernel/resource.c b/kernel/resource.c
index 0bcebffc4e77..19f2357dfda3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -22,6 +22,7 @@
#include <linux/device.h>
#include <linux/pfn.h>
#include <linux/mm.h>
+#include <linux/resource_ext.h>
#include <asm/io.h>
@@ -1529,6 +1530,30 @@ int iomem_is_exclusive(u64 addr)
return err;
}
+struct resource_entry *resource_list_create_entry(struct resource *res,
+ size_t extra_size)
+{
+ struct resource_entry *entry;
+
+ entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL);
+ if (entry) {
+ INIT_LIST_HEAD(&entry->node);
+ entry->res = res ? res : &entry->__res;
+ }
+
+ return entry;
+}
+EXPORT_SYMBOL(resource_list_create_entry);
+
+void resource_list_free(struct list_head *head)
+{
+ struct resource_entry *entry, *tmp;
+
+ list_for_each_entry_safe(entry, tmp, head, node)
+ resource_list_destroy_entry(entry);
+}
+EXPORT_SYMBOL(resource_list_free);
+
static int __init strict_iomem(char *str)
{
if (strstr(str, "relaxed"))
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 607f852b4d04..7052d3fd4e7b 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x)
unsigned long flags;
int ret = 1;
+ /*
+ * Since x->done will need to be locked only
+ * in the non-blocking case, we check x->done
+ * first without taking the lock so we can
+ * return early in the blocking case.
+ */
+ if (!ACCESS_ONCE(x->done))
+ return 0;
+
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
@@ -288,13 +297,6 @@ EXPORT_SYMBOL(try_wait_for_completion);
*/
bool completion_done(struct completion *x)
{
- unsigned long flags;
- int ret = 1;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
+ return !!ACCESS_ONCE(x->done);
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b5797b78add6..1f37fe7f77a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
{
s64 delta;
- if (rq->skip_clock_update > 0)
+ lockdep_assert_held(&rq->lock);
+
+ if (rq->clock_skip_update & RQCF_ACT_SKIP)
return;
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -490,6 +492,11 @@ static __init void init_hrtick(void)
*/
void hrtick_start(struct rq *rq, u64 delay)
{
+ /*
+ * Don't schedule slices shorter than 10000ns, that just
+ * doesn't make sense. Rely on vruntime for fairness.
+ */
+ delay = max_t(u64, delay, 10000LL);
__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
HRTIMER_MODE_REL_PINNED, 0);
}
@@ -1046,7 +1053,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* this case, we can save a useless back to back clock update.
*/
if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
- rq->skip_clock_update = 1;
+ rq_clock_skip_update(rq, true);
}
#ifdef CONFIG_SMP
@@ -1082,7 +1089,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+ perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
}
__set_task_cpu(p, new_cpu);
@@ -1814,6 +1821,10 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_period = 0;
dl_se->flags = 0;
dl_se->dl_bw = 0;
+
+ dl_se->dl_throttled = 0;
+ dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
/*
@@ -1832,6 +1843,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+#ifdef CONFIG_SMP
+ p->se.avg.decay_count = 0;
+#endif
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
@@ -1839,7 +1853,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
RB_CLEAR_NODE(&p->dl.rb_node);
- hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ init_dl_task_timer(&p->dl);
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2049,6 +2063,9 @@ static inline int dl_bw_cpus(int i)
* allocated bandwidth to reflect the new situation.
*
* This function is called while holding p's rq->lock.
+ *
+ * XXX we should delay bw change until the task's 0-lag point, see
+ * __setparam_dl().
*/
static int dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
@@ -2748,6 +2765,10 @@ again:
* - explicit schedule() call
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
+ *
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
*/
static void __sched __schedule(void)
{
@@ -2756,7 +2777,6 @@ static void __sched __schedule(void)
struct rq *rq;
int cpu;
-need_resched:
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -2776,6 +2796,8 @@ need_resched:
smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
+ rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2800,13 +2822,13 @@ need_resched:
switch_count = &prev->nvcsw;
}
- if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
+ if (task_on_rq_queued(prev))
update_rq_clock(rq);
next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
- rq->skip_clock_update = 0;
+ rq->clock_skip_update = 0;
if (likely(prev != next)) {
rq->nr_switches++;
@@ -2821,8 +2843,6 @@ need_resched:
post_schedule(rq);
sched_preempt_enable_no_resched();
- if (need_resched())
- goto need_resched;
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -2842,7 +2862,9 @@ asmlinkage __visible void __sched schedule(void)
struct task_struct *tsk = current;
sched_submit_work(tsk);
- __schedule();
+ do {
+ __schedule();
+ } while (need_resched());
}
EXPORT_SYMBOL(schedule);
@@ -2877,6 +2899,21 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
+static void preempt_schedule_common(void)
+{
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ __schedule();
+ __preempt_count_sub(PREEMPT_ACTIVE);
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ barrier();
+ } while (need_resched());
+}
+
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
@@ -2892,17 +2929,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
if (likely(!preemptible()))
return;
- do {
- __preempt_count_add(PREEMPT_ACTIVE);
- __schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
-
- /*
- * Check again in case we missed a preemption opportunity
- * between schedule and now.
- */
- barrier();
- } while (need_resched());
+ preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
@@ -3251,15 +3278,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_dl_entity *dl_se = &p->dl;
- init_dl_task_timer(dl_se);
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
dl_se->flags = attr->sched_flags;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
- dl_se->dl_throttled = 0;
- dl_se->dl_new = 1;
- dl_se->dl_yielded = 0;
+
+ /*
+ * Changing the parameters of a task is 'tricky' and we're not doing
+ * the correct thing -- also see task_dead_dl() and switched_from_dl().
+ *
+ * What we SHOULD do is delay the bandwidth release until the 0-lag
+ * point. This would include retaining the task_struct until that time
+ * and change dl_overflow() to not immediately decrement the current
+ * amount.
+ *
+ * Instead we retain the current runtime/deadline and let the new
+ * parameters take effect after the current reservation period lapses.
+ * This is safe (albeit pessimistic) because the 0-lag point is always
+ * before the current scheduling deadline.
+ *
+ * We can still have temporary overloads because we do not delay the
+ * change in bandwidth until that time; so admission control is
+ * not on the safe side. It does however guarantee tasks will never
+ * consume more than promised.
+ */
}
/*
@@ -3382,6 +3425,20 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
+static bool dl_param_changed(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ if (dl_se->dl_runtime != attr->sched_runtime ||
+ dl_se->dl_deadline != attr->sched_deadline ||
+ dl_se->dl_period != attr->sched_period ||
+ dl_se->flags != attr->sched_flags)
+ return true;
+
+ return false;
+}
+
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user)
@@ -3510,7 +3567,7 @@ recheck:
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
- if (dl_policy(policy))
+ if (dl_policy(policy) && dl_param_changed(p, attr))
goto change;
p->sched_reset_on_fork = reset_on_fork;
@@ -4202,17 +4259,10 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
-static void __cond_resched(void)
-{
- __preempt_count_add(PREEMPT_ACTIVE);
- __schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
-}
-
int __sched _cond_resched(void)
{
if (should_resched()) {
- __cond_resched();
+ preempt_schedule_common();
return 1;
}
return 0;
@@ -4237,7 +4287,7 @@ int __cond_resched_lock(spinlock_t *lock)
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
if (resched)
- __cond_resched();
+ preempt_schedule_common();
else
cpu_relax();
ret = 1;
@@ -4253,7 +4303,7 @@ int __sched __cond_resched_softirq(void)
if (should_resched()) {
local_bh_enable();
- __cond_resched();
+ preempt_schedule_common();
local_bh_disable();
return 1;
}
@@ -4508,9 +4558,10 @@ void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
int ppid;
- unsigned state;
+ unsigned long state = p->state;
- state = p->state ? __ffs(p->state) + 1 : 0;
+ if (state)
+ state = __ffs(state) + 1;
printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
@@ -4642,6 +4693,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
struct dl_bw *cur_dl_b;
unsigned long flags;
+ if (!cpumask_weight(cur))
+ return ret;
+
rcu_read_lock_sched();
cur_dl_b = dl_bw_of(cpumask_any(cur));
trial_cpus = cpumask_weight(trial);
@@ -4740,7 +4794,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
- if (p->sched_class && p->sched_class->set_cpus_allowed)
+ if (p->sched_class->set_cpus_allowed)
p->sched_class->set_cpus_allowed(p, new_mask);
cpumask_copy(&p->cpus_allowed, new_mask);
@@ -7113,9 +7167,6 @@ void __init sched_init(void)
#ifdef CONFIG_RT_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
- alloc_size += num_possible_cpus() * cpumask_size();
-#endif
if (alloc_size) {
ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
@@ -7135,13 +7186,13 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_RT_GROUP_SCHED */
+ }
#ifdef CONFIG_CPUMASK_OFFSTACK
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (void *)ptr;
- ptr += cpumask_size();
- }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
+ for_each_possible_cpu(i) {
+ per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
+ cpumask_size(), GFP_KERNEL, cpu_to_node(i));
}
+#endif /* CONFIG_CPUMASK_OFFSTACK */
init_rt_bandwidth(&def_rt_bandwidth,
global_rt_period(), global_rt_runtime());
@@ -7253,6 +7304,11 @@ void __init sched_init(void)
enter_lazy_tlb(&init_mm, current);
/*
+ * During early bootup we pretend to be a normal task:
+ */
+ current->sched_class = &fair_sched_class;
+
+ /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
@@ -7262,11 +7318,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
- /*
- * During early bootup we pretend to be a normal task:
- */
- current->sched_class = &fair_sched_class;
-
#ifdef CONFIG_SMP
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
@@ -7295,13 +7346,12 @@ void __might_sleep(const char *file, int line, int preempt_offset)
* since we will exit with TASK_RUNNING make sure we enter with it,
* otherwise we will destroy state.
*/
- if (WARN_ONCE(current->state != TASK_RUNNING,
+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
"do not call blocking ops when !TASK_RUNNING; "
"state=%lx set at [<%p>] %pS\n",
current->state,
(void *)current->task_state_change,
- (void *)current->task_state_change))
- __set_current_state(TASK_RUNNING);
+ (void *)current->task_state_change);
___might_sleep(file, line, preempt_offset);
}
@@ -7328,6 +7378,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
in_atomic(), irqs_disabled(),
current->pid, current->comm);
+ if (task_stack_end_corrupted(current))
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 539ca3ce071b..c6acb07466bb 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl;
- if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
+ if (later_mask &&
+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
best_cpu = cpumask_any(later_mask);
goto out;
} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
@@ -186,6 +187,26 @@ out:
}
/*
+ * cpudl_set_freecpu - Set the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_set_freecpu(struct cpudl *cp, int cpu)
+{
+ cpumask_set_cpu(cpu, cp->free_cpus);
+}
+
+/*
+ * cpudl_clear_freecpu - Clear the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
+{
+ cpumask_clear_cpu(cpu, cp->free_cpus);
+}
+
+/*
* cpudl_init - initialize the cpudl structure
* @cp: the cpudl max-heap context
*/
@@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp)
if (!cp->elements)
return -ENOMEM;
- if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
kfree(cp->elements);
return -ENOMEM;
}
@@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp)
for_each_possible_cpu(i)
cp->elements[i].idx = IDX_INVALID;
- cpumask_setall(cp->free_cpus);
-
return 0;
}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 020039bd1326..1a0a6ef2fbe1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
int cpudl_init(struct cpudl *cp);
+void cpudl_set_freecpu(struct cpudl *cp, int cpu);
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
#endif /* CONFIG_SMP */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e5db8c6feebd..a027799ae130 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
+
+ if (dl_se->dl_yielded)
+ dl_se->dl_yielded = 0;
+ if (dl_se->dl_throttled)
+ dl_se->dl_throttled = 0;
}
/*
@@ -536,23 +541,19 @@ again:
sched_clock_tick();
update_rq_clock(rq);
- dl_se->dl_throttled = 0;
- dl_se->dl_yielded = 0;
- if (task_on_rq_queued(p)) {
- enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
- else
- resched_curr(rq);
+ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
#ifdef CONFIG_SMP
- /*
- * Queueing this task back might have overloaded rq,
- * check if we need to kick someone away.
- */
- if (has_pushable_dl_tasks(rq))
- push_dl_task(rq);
+ /*
+ * Queueing this task back might have overloaded rq,
+ * check if we need to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq))
+ push_dl_task(rq);
#endif
- }
unlock:
raw_spin_unlock(&rq->lock);
@@ -570,24 +571,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
static
int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
{
- int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
- int rorun = dl_se->runtime <= 0;
-
- if (!rorun && !dmiss)
- return 0;
-
- /*
- * If we are beyond our current deadline and we are still
- * executing, then we have already used some of the runtime of
- * the next instance. Thus, if we do not account that, we are
- * stealing bandwidth from the system at each deadline miss!
- */
- if (dmiss) {
- dl_se->runtime = rorun ? dl_se->runtime : 0;
- dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
- }
-
- return 1;
+ return (dl_se->runtime <= 0);
}
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
@@ -630,10 +614,9 @@ static void update_curr_dl(struct rq *rq)
dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
if (dl_runtime_exceeded(rq, dl_se)) {
+ dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
- if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
- dl_se->dl_throttled = 1;
- else
+ if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
@@ -826,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
*/
- if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
- replenish_dl_entity(dl_se, pi_se);
- else
+ if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
update_dl_entity(dl_se, pi_se);
+ else if (flags & ENQUEUE_REPLENISH)
+ replenish_dl_entity(dl_se, pi_se);
__enqueue_dl_entity(dl_se);
}
@@ -870,7 +853,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* its rq, the bandwidth timer callback (which clearly has not
* run yet) will take care of this.
*/
- if (p->dl.dl_throttled)
+ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
return;
enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -1090,7 +1073,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
- if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
+ /*
+ * Even when we have runtime, update_curr_dl() might have resulted in us
+ * not being the leftmost task anymore. In that case NEED_RESCHED will
+ * be set and schedule() will start a new hrtick for the next task.
+ */
+ if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
+ is_leftmost(p, &rq->dl))
start_hrtick_dl(rq, p);
}
@@ -1111,6 +1100,7 @@ static void task_dead_dl(struct task_struct *p)
* Since we are TASK_DEAD we won't slip out of the domain!
*/
raw_spin_lock_irq(&dl_b->lock);
+ /* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
raw_spin_unlock_irq(&dl_b->lock);
@@ -1182,9 +1172,6 @@ static int find_later_rq(struct task_struct *task)
* We have to consider system topology and task affinity
* first, then we can look for a suitable cpu.
*/
- cpumask_copy(later_mask, task_rq(task)->rd->span);
- cpumask_and(later_mask, later_mask, cpu_active_mask);
- cpumask_and(later_mask, later_mask, &task->cpus_allowed);
best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
task, later_mask);
if (best_cpu == -1)
@@ -1579,6 +1566,7 @@ static void rq_online_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_set_overload(rq);
+ cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
if (rq->dl.dl_nr_running > 0)
cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
}
@@ -1590,6 +1578,7 @@ static void rq_offline_dl(struct rq *rq)
dl_clear_overload(rq);
cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
}
void init_sched_dl_class(void)
@@ -1631,8 +1620,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
+ /* XXX we should retain the bw until 0-lag */
cancel_dl_timer(rq, p);
-
__dl_clear_params(p);
/*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 92cc52001e74..8baaf858d25c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -305,6 +305,7 @@ do { \
PN(next_balance);
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
+ PN(clock_task);
P(cpu_load[0]);
P(cpu_load[1]);
P(cpu_load[2]);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df2cdf77f899..7ce18f3c097a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p)
{
u32 slice;
- p->se.avg.decay_count = 0;
slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
p->se.avg.runnable_avg_sum = slice;
p->se.avg.runnable_avg_period = slice;
@@ -1730,7 +1729,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
nodes = node_online_map;
for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
- nodemask_t max_group;
+ nodemask_t max_group = NODE_MASK_NONE;
int a, b;
/* Are there nodes at this distance from each other? */
@@ -2574,11 +2573,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
u64 decays = atomic64_read(&cfs_rq->decay_counter);
decays -= se->avg.decay_count;
+ se->avg.decay_count = 0;
if (!decays)
return 0;
se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
- se->avg.decay_count = 0;
return decays;
}
@@ -4005,6 +4004,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ /* init_cfs_bandwidth() was not called */
+ if (!cfs_b->throttled_cfs_rq.next)
+ return;
+
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
}
@@ -4424,7 +4427,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
* wl = S * s'_i; see (2)
*/
if (W > 0 && w < W)
- wl = (w * tg->shares) / W;
+ wl = (w * (long)tg->shares) / W;
else
wl = tg->shares;
@@ -5153,7 +5156,7 @@ static void yield_task_fair(struct rq *rq)
* so we don't do microscopic update in schedule()
* and double the fastpath cost.
*/
- rq->skip_clock_update = 1;
+ rq_clock_skip_update(rq, true);
}
set_skip_buddy(se);
@@ -5945,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu)
*/
age_stamp = ACCESS_ONCE(rq->age_stamp);
avg = ACCESS_ONCE(rq->rt_avg);
+ delta = __rq_clock_broken(rq) - age_stamp;
- delta = rq_clock(rq) - age_stamp;
if (unlikely(delta < 0))
delta = 0;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..aaf1c1d5cf5d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -47,7 +47,8 @@ static inline int cpu_idle_poll(void)
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
- while (!tif_need_resched())
+ while (!tif_need_resched() &&
+ (cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ee15f5a0d1c1..f4d4b077eba0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
enqueue = 1;
/*
- * Force a clock update if the CPU was idle,
- * lest wakeup -> unthrottle time accumulate.
+ * When we're idle and a woken (rt) task is
+ * throttled check_preempt_curr() will set
+ * skip_update and the time between the wakeup
+ * and this unthrottle will get accounted as
+ * 'runtime'.
*/
if (rt_rq->rt_nr_running && rq->curr == rq->idle)
- rq->skip_clock_update = -1;
+ rq_clock_skip_update(rq, false);
}
if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0;
@@ -1337,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
- if (target != -1)
+ /*
+ * Don't bother moving it if the destination CPU is
+ * not running a lower priority task.
+ */
+ if (target != -1 &&
+ p->prio < cpu_rq(target)->rt.highest_prio.curr)
cpu = target;
}
rcu_read_unlock();
@@ -1614,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
lowest_rq = cpu_rq(cpu);
+ if (lowest_rq->rt.highest_prio.curr <= task->prio) {
+ /*
+ * Target rq has tasks of equal or higher priority,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ lowest_rq = NULL;
+ break;
+ }
+
/* if the prio of this runqueue changed, try again */
if (double_lock_balance(rq, lowest_rq)) {
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a2a45c970e7..0870db23d79c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -558,8 +558,6 @@ struct rq {
#ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
#endif
- int skip_clock_update;
-
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
@@ -588,6 +586,7 @@ struct rq {
unsigned long next_balance;
struct mm_struct *prev_mm;
+ unsigned int clock_skip_update;
u64 clock;
u64 clock_task;
@@ -687,16 +686,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+ return ACCESS_ONCE(rq->clock);
+}
+
static inline u64 rq_clock(struct rq *rq)
{
+ lockdep_assert_held(&rq->lock);
return rq->clock;
}
static inline u64 rq_clock_task(struct rq *rq)
{
+ lockdep_assert_held(&rq->lock);
return rq->clock_task;
}
+#define RQCF_REQ_SKIP 0x01
+#define RQCF_ACT_SKIP 0x02
+
+static inline void rq_clock_skip_update(struct rq *rq, bool skip)
+{
+ lockdep_assert_held(&rq->lock);
+ if (skip)
+ rq->clock_skip_update |= RQCF_REQ_SKIP;
+ else
+ rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+}
+
#ifdef CONFIG_NUMA
enum numa_topology_type {
NUMA_DIRECT,
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f032fb5284e3..40190f28db35 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -280,6 +280,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
unsigned int cpu;
int ret = 0;
+ get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
@@ -292,6 +293,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
list_add(&plug_thread->list, &hotplug_threads);
out:
mutex_unlock(&smpboot_threads_lock);
+ put_online_cpus();
return ret;
}
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 501baa9ac1be..479e4436f787 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
trace_softirqs_off(ip);
raw_local_irq_restore(flags);
- if (preempt_count() == cnt)
+ if (preempt_count() == cnt) {
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
+#endif
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ }
}
EXPORT_SYMBOL(__local_bh_disable_ip);
#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu)
* in the task stack here.
*/
__do_softirq();
- rcu_note_context_switch();
local_irq_enable();
- cond_resched();
+ cond_resched_rcu_qs();
return;
}
local_irq_enable();
diff --git a/kernel/sys.c b/kernel/sys.c
index a8c9f5a7dda6..ea9c88109894 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2210,9 +2210,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
up_write(&me->mm->mmap_sem);
break;
case PR_MPX_ENABLE_MANAGEMENT:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
error = MPX_ENABLE_MANAGEMENT(me);
break;
case PR_MPX_DISABLE_MANAGEMENT:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
error = MPX_DISABLE_MANAGEMENT(me);
break;
default:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 137c7f69b264..88ea2d6e0031 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1248,7 +1248,6 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = hugetlb_sysctl_handler,
- .extra1 = &zero,
},
#ifdef CONFIG_NUMA
{
@@ -1257,7 +1256,6 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &hugetlb_mempolicy_sysctl_handler,
- .extra1 = &zero,
},
#endif
{
@@ -1280,7 +1278,6 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = hugetlb_overcommit_handler,
- .extra1 = &zero,
},
#endif
{
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 670fff88a961..21f82c29c914 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -111,13 +111,8 @@ static int send_reply(struct sk_buff *skb, struct genl_info *info)
{
struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
void *reply = genlmsg_data(genlhdr);
- int rc;
- rc = genlmsg_end(skb, reply);
- if (rc < 0) {
- nlmsg_free(skb);
- return rc;
- }
+ genlmsg_end(skb, reply);
return genlmsg_reply(skb, info);
}
@@ -134,11 +129,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
void *reply = genlmsg_data(genlhdr);
int rc, delcount = 0;
- rc = genlmsg_end(skb, reply);
- if (rc < 0) {
- nlmsg_free(skb);
- return;
- }
+ genlmsg_end(skb, reply);
rc = 0;
down_read(&listeners->sem);
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f622cf28628a..c09c07817d7a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,6 +1,6 @@
obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
-obj-y += timeconv.o posix-clock.o alarmtimer.o
+obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b79f39bda7e1..4892352f0e49 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -34,82 +34,6 @@
#include "tick-internal.h"
#include "timekeeping_internal.h"
-void timecounter_init(struct timecounter *tc,
- const struct cyclecounter *cc,
- u64 start_tstamp)
-{
- tc->cc = cc;
- tc->cycle_last = cc->read(cc);
- tc->nsec = start_tstamp;
-}
-EXPORT_SYMBOL_GPL(timecounter_init);
-
-/**
- * timecounter_read_delta - get nanoseconds since last call of this function
- * @tc: Pointer to time counter
- *
- * When the underlying cycle counter runs over, this will be handled
- * correctly as long as it does not run over more than once between
- * calls.
- *
- * The first call to this function for a new time counter initializes
- * the time tracking and returns an undefined result.
- */
-static u64 timecounter_read_delta(struct timecounter *tc)
-{
- cycle_t cycle_now, cycle_delta;
- u64 ns_offset;
-
- /* read cycle counter: */
- cycle_now = tc->cc->read(tc->cc);
-
- /* calculate the delta since the last timecounter_read_delta(): */
- cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
-
- /* convert to nanoseconds: */
- ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
-
- /* update time stamp of timecounter_read_delta() call: */
- tc->cycle_last = cycle_now;
-
- return ns_offset;
-}
-
-u64 timecounter_read(struct timecounter *tc)
-{
- u64 nsec;
-
- /* increment time by nanoseconds since last call */
- nsec = timecounter_read_delta(tc);
- nsec += tc->nsec;
- tc->nsec = nsec;
-
- return nsec;
-}
-EXPORT_SYMBOL_GPL(timecounter_read);
-
-u64 timecounter_cyc2time(struct timecounter *tc,
- cycle_t cycle_tstamp)
-{
- u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
- u64 nsec;
-
- /*
- * Instead of always treating cycle_tstamp as more recent
- * than tc->cycle_last, detect when it is too far in the
- * future and treat it as old time stamp instead.
- */
- if (cycle_delta > tc->cc->mask / 2) {
- cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
- nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
- } else {
- nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
- }
-
- return nsec;
-}
-EXPORT_SYMBOL_GPL(timecounter_cyc2time);
-
/**
* clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
* @mult: pointer to mult variable
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 37e50aadd471..3f5e183c3d97 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -122,7 +122,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
boot = ktime_add(mono, off_boot);
xtim = ktime_add(mono, off_real);
- tai = ktime_add(xtim, off_tai);
+ tai = ktime_add(mono, off_tai);
base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
@@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
/*
* Divide a ktime value by a nanosecond value
*/
-u64 ktime_divns(const ktime_t kt, s64 div)
+u64 __ktime_divns(const ktime_t kt, s64 div)
{
u64 dclc;
int sft = 0;
@@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div)
return dclc;
}
-EXPORT_SYMBOL_GPL(ktime_divns);
+EXPORT_SYMBOL_GPL(__ktime_divns);
#endif /* BITS_PER_LONG >= 64 */
/*
@@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer)
trace_hrtimer_cancel(timer);
}
+#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
+{
+ struct hrtimer_clock_base *base = cpu_base->clock_base;
+ ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
+ int i;
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ struct timerqueue_node *next;
+ struct hrtimer *timer;
+
+ next = timerqueue_getnext(&base->active);
+ if (!next)
+ continue;
+
+ timer = container_of(next, struct hrtimer, node);
+ expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ if (expires.tv64 < expires_next.tv64)
+ expires_next = expires;
+ }
+ /*
+ * clock_was_set() might have changed base->offset of any of
+ * the clock bases so the result might be negative. Fix it up
+ * to prevent a false positive in clockevents_program_event().
+ */
+ if (expires_next.tv64 < 0)
+ expires_next.tv64 = 0;
+ return expires_next;
+}
+#endif
+
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void)
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
- int i;
- struct hrtimer_clock_base *base = cpu_base->clock_base;
- ktime_t expires, expires_next;
-
- expires_next.tv64 = KTIME_MAX;
-
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
- struct hrtimer *timer;
- struct timerqueue_node *next;
-
- next = timerqueue_getnext(&base->active);
- if (!next)
- continue;
- timer = container_of(next, struct hrtimer, node);
-
- expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- /*
- * clock_was_set() has changed base->offset so the
- * result might be negative. Fix it up to prevent a
- * false positive in clockevents_program_event()
- */
- if (expires.tv64 < 0)
- expires.tv64 = 0;
- if (expires.tv64 < expires_next.tv64)
- expires_next = expires;
- }
+ ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
return;
@@ -587,6 +593,15 @@ static int hrtimer_reprogram(struct hrtimer *timer,
return 0;
/*
+ * When the target cpu of the timer is currently executing
+ * hrtimer_interrupt(), then we do not touch the clock event
+ * device. hrtimer_interrupt() will reevaluate all clock bases
+ * before reprogramming the device.
+ */
+ if (cpu_base->in_hrtirq)
+ return 0;
+
+ /*
* If a hang was detected in the last timer interrupt then we
* do not schedule a timer which is earlier than the expiry
* which we enforced in the hang detection. We want the system
@@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
ktime_t hrtimer_get_next_event(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- struct hrtimer_clock_base *base = cpu_base->clock_base;
- ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
+ ktime_t mindelta = { .tv64 = KTIME_MAX };
unsigned long flags;
- int i;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
- if (!hrtimer_hres_active()) {
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
- struct hrtimer *timer;
- struct timerqueue_node *next;
-
- next = timerqueue_getnext(&base->active);
- if (!next)
- continue;
-
- timer = container_of(next, struct hrtimer, node);
- delta.tv64 = hrtimer_get_expires_tv64(timer);
- delta = ktime_sub(delta, base->get_time());
- if (delta.tv64 < mindelta.tv64)
- mindelta.tv64 = delta.tv64;
- }
- }
+ if (!hrtimer_hres_active())
+ mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
+ ktime_get());
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
raw_spin_lock(&cpu_base->lock);
entry_time = now = hrtimer_update_base(cpu_base);
retry:
- expires_next.tv64 = KTIME_MAX;
+ cpu_base->in_hrtirq = 1;
/*
* We set expires_next to KTIME_MAX here with cpu_base->lock
* held to prevent that a timer is enqueued in our queue via
@@ -1291,28 +1291,20 @@ retry:
* are right-of a not yet expired timer, because that
* timer will have to trigger a wakeup anyway.
*/
-
- if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
- ktime_t expires;
-
- expires = ktime_sub(hrtimer_get_expires(timer),
- base->offset);
- if (expires.tv64 < 0)
- expires.tv64 = KTIME_MAX;
- if (expires.tv64 < expires_next.tv64)
- expires_next = expires;
+ if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
break;
- }
__run_hrtimer(timer, &basenow);
}
}
-
+ /* Reevaluate the clock bases for the next expiry */
+ expires_next = __hrtimer_get_next_event(cpu_base);
/*
* Store the new expiry value so the migration code can verify
* against it.
*/
cpu_base->expires_next = expires_next;
+ cpu_base->in_hrtirq = 0;
raw_spin_unlock(&cpu_base->lock);
/* Reprogramming necessary ? */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 87a346fd6d61..4b585e0fdd22 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -488,13 +488,13 @@ static void sync_cmos_clock(struct work_struct *work)
getnstimeofday64(&now);
if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
- struct timespec adjust = timespec64_to_timespec(now);
+ struct timespec64 adjust = now;
fail = -ENODEV;
if (persistent_clock_is_local)
adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
#ifdef CONFIG_GENERIC_CMOS_UPDATE
- fail = update_persistent_clock(adjust);
+ fail = update_persistent_clock(timespec64_to_timespec(adjust));
#endif
#ifdef CONFIG_RTC_SYSTOHC
if (fail == -ENODEV)
@@ -633,6 +633,13 @@ int ntp_validate_timex(struct timex *txc)
if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
return -EPERM;
+ if (txc->modes & ADJ_FREQUENCY) {
+ if (LONG_MIN / PPM_SCALE > txc->freq)
+ return -EINVAL;
+ if (LONG_MAX / PPM_SCALE < txc->freq)
+ return -EINVAL;
+ }
+
return 0;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 4d54b7540585..1363d58f07e9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -847,7 +847,6 @@ void tick_nohz_idle_enter(void)
local_irq_enable();
}
-EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
/**
* tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -974,7 +973,6 @@ void tick_nohz_idle_exit(void)
local_irq_enable();
}
-EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
{
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 6390517e77d4..2c85b7724af4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -196,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
if (tv) {
if (copy_from_user(&user_tv, tv, sizeof(*tv)))
return -EFAULT;
+
+ if (!timeval_valid(&user_tv))
+ return -EINVAL;
+
new_ts.tv_sec = user_tv.tv_sec;
new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
}
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
new file mode 100644
index 000000000000..4687b3104bae
--- /dev/null
+++ b/kernel/time/timecounter.c
@@ -0,0 +1,112 @@
+/*
+ * linux/kernel/time/timecounter.c
+ *
+ * based on code that migrated away from
+ * linux/kernel/time/clocksource.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/export.h>
+#include <linux/timecounter.h>
+
+void timecounter_init(struct timecounter *tc,
+ const struct cyclecounter *cc,
+ u64 start_tstamp)
+{
+ tc->cc = cc;
+ tc->cycle_last = cc->read(cc);
+ tc->nsec = start_tstamp;
+ tc->mask = (1ULL << cc->shift) - 1;
+ tc->frac = 0;
+}
+EXPORT_SYMBOL_GPL(timecounter_init);
+
+/**
+ * timecounter_read_delta - get nanoseconds since last call of this function
+ * @tc: Pointer to time counter
+ *
+ * When the underlying cycle counter runs over, this will be handled
+ * correctly as long as it does not run over more than once between
+ * calls.
+ *
+ * The first call to this function for a new time counter initializes
+ * the time tracking and returns an undefined result.
+ */
+static u64 timecounter_read_delta(struct timecounter *tc)
+{
+ cycle_t cycle_now, cycle_delta;
+ u64 ns_offset;
+
+ /* read cycle counter: */
+ cycle_now = tc->cc->read(tc->cc);
+
+ /* calculate the delta since the last timecounter_read_delta(): */
+ cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
+
+ /* convert to nanoseconds: */
+ ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta,
+ tc->mask, &tc->frac);
+
+ /* update time stamp of timecounter_read_delta() call: */
+ tc->cycle_last = cycle_now;
+
+ return ns_offset;
+}
+
+u64 timecounter_read(struct timecounter *tc)
+{
+ u64 nsec;
+
+ /* increment time by nanoseconds since last call */
+ nsec = timecounter_read_delta(tc);
+ nsec += tc->nsec;
+ tc->nsec = nsec;
+
+ return nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_read);
+
+/*
+ * This is like cyclecounter_cyc2ns(), but it is used for computing a
+ * time previous to the time stored in the cycle counter.
+ */
+static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
+ cycle_t cycles, u64 mask, u64 frac)
+{
+ u64 ns = (u64) cycles;
+
+ ns = ((ns * cc->mult) - frac) >> cc->shift;
+
+ return ns;
+}
+
+u64 timecounter_cyc2time(struct timecounter *tc,
+ cycle_t cycle_tstamp)
+{
+ u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
+ u64 nsec = tc->nsec, frac = tc->frac;
+
+ /*
+ * Instead of always treating cycle_tstamp as more recent
+ * than tc->cycle_last, detect when it is too far in the
+ * future and treat it as old time stamp instead.
+ */
+ if (delta > tc->cc->mask / 2) {
+ delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
+ nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac);
+ } else {
+ nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac);
+ }
+
+ return nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_cyc2time);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6a931852082f..b124af259800 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1659,24 +1659,24 @@ out:
}
/**
- * getboottime - Return the real time of system boot.
- * @ts: pointer to the timespec to be set
+ * getboottime64 - Return the real time of system boot.
+ * @ts: pointer to the timespec64 to be set
*
- * Returns the wall-time of boot in a timespec.
+ * Returns the wall-time of boot in a timespec64.
*
* This is based on the wall_to_monotonic offset and the total suspend
* time. Calls to settimeofday will affect the value returned (which
* basically means that however wrong your real time clock is at boot time,
* you get the right time here).
*/
-void getboottime(struct timespec *ts)
+void getboottime64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
- *ts = ktime_to_timespec(t);
+ *ts = ktime_to_timespec64(t);
}
-EXPORT_SYMBOL_GPL(getboottime);
+EXPORT_SYMBOL_GPL(getboottime64);
unsigned long get_seconds(void)
{
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 9a7e0e60a1a8..98f26588255e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -55,7 +55,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
-ifeq ($(CONFIG_PM_RUNTIME),y)
+ifeq ($(CONFIG_PM),y)
obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
endif
ifeq ($(CONFIG_TRACING),y)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 929a733d302e..224e768bdc73 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2497,12 +2497,14 @@ static void ftrace_run_update_code(int command)
}
static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
- struct ftrace_hash *old_hash)
+ struct ftrace_ops_hash *old_hash)
{
ops->flags |= FTRACE_OPS_FL_MODIFYING;
- ops->old_hash.filter_hash = old_hash;
+ ops->old_hash.filter_hash = old_hash->filter_hash;
+ ops->old_hash.notrace_hash = old_hash->notrace_hash;
ftrace_run_update_code(command);
ops->old_hash.filter_hash = NULL;
+ ops->old_hash.notrace_hash = NULL;
ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
}
@@ -3579,7 +3581,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
static int ftrace_probe_registered;
-static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash)
+static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
{
int ret;
int i;
@@ -3637,6 +3639,7 @@ int
register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data)
{
+ struct ftrace_ops_hash old_hash_ops;
struct ftrace_func_probe *entry;
struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
struct ftrace_hash *old_hash = *orig_hash;
@@ -3658,6 +3661,10 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
mutex_lock(&trace_probe_ops.func_hash->regex_lock);
+ old_hash_ops.filter_hash = old_hash;
+ /* Probes only have filters */
+ old_hash_ops.notrace_hash = NULL;
+
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
if (!hash) {
count = -ENOMEM;
@@ -3718,7 +3725,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
- __enable_ftrace_function_probe(old_hash);
+ __enable_ftrace_function_probe(&old_hash_ops);
if (!ret)
free_ftrace_hash_rcu(old_hash);
@@ -4006,10 +4013,34 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
}
static void ftrace_ops_update_code(struct ftrace_ops *ops,
- struct ftrace_hash *old_hash)
+ struct ftrace_ops_hash *old_hash)
{
- if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
+ struct ftrace_ops *op;
+
+ if (!ftrace_enabled)
+ return;
+
+ if (ops->flags & FTRACE_OPS_FL_ENABLED) {
ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
+ return;
+ }
+
+ /*
+ * If this is the shared global_ops filter, then we need to
+ * check if there is another ops that shares it, is enabled.
+ * If so, we still need to run the modify code.
+ */
+ if (ops->func_hash != &global_ops.local_hash)
+ return;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op->func_hash == &global_ops.local_hash &&
+ op->flags & FTRACE_OPS_FL_ENABLED) {
+ ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
+ /* Only need to do this once */
+ return;
+ }
+ } while_for_each_ftrace_op(op);
}
static int
@@ -4017,6 +4048,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
unsigned long ip, int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
+ struct ftrace_ops_hash old_hash_ops;
struct ftrace_hash *old_hash;
struct ftrace_hash *hash;
int ret;
@@ -4053,9 +4085,11 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
mutex_lock(&ftrace_lock);
old_hash = *orig_hash;
+ old_hash_ops.filter_hash = ops->func_hash->filter_hash;
+ old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
ret = ftrace_hash_move(ops, enable, orig_hash, hash);
if (!ret) {
- ftrace_ops_update_code(ops, old_hash);
+ ftrace_ops_update_code(ops, &old_hash_ops);
free_ftrace_hash_rcu(old_hash);
}
mutex_unlock(&ftrace_lock);
@@ -4267,6 +4301,7 @@ static void __init set_ftrace_early_filters(void)
int ftrace_regex_release(struct inode *inode, struct file *file)
{
struct seq_file *m = (struct seq_file *)file->private_data;
+ struct ftrace_ops_hash old_hash_ops;
struct ftrace_iterator *iter;
struct ftrace_hash **orig_hash;
struct ftrace_hash *old_hash;
@@ -4300,10 +4335,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
mutex_lock(&ftrace_lock);
old_hash = *orig_hash;
+ old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash;
+ old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash;
ret = ftrace_hash_move(iter->ops, filter_hash,
orig_hash, iter->hash);
if (!ret) {
- ftrace_ops_update_code(iter->ops, old_hash);
+ ftrace_ops_update_code(iter->ops, &old_hash_ops);
free_ftrace_hash_rcu(old_hash);
}
mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 1c71382b283d..eb4220a132ec 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2e767972e99c..4a9079b9f082 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6918,7 +6918,6 @@ void __init trace_init(void)
tracepoint_printk = 0;
}
tracer_alloc_buffers();
- init_ftrace_syscalls();
trace_event_init();
}
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4b9c114ee9de..6fa484de2ba1 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags)
}
void *perf_trace_buf_prepare(int size, unsigned short type,
- struct pt_regs *regs, int *rctxp)
+ struct pt_regs **regs, int *rctxp)
{
struct trace_entry *entry;
unsigned long flags;
@@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type,
if (*rctxp < 0)
return NULL;
+ if (regs)
+ *regs = this_cpu_ptr(&__perf_regs[*rctxp]);
raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
/* zero the dead bytes from align to not leak stack to user */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 366a78a3e61e..b03a0ea77b99 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2429,12 +2429,39 @@ static __init int event_trace_memsetup(void)
return 0;
}
+static __init void
+early_enable_events(struct trace_array *tr, bool disable_first)
+{
+ char *buf = bootup_event_buf;
+ char *token;
+ int ret;
+
+ while (true) {
+ token = strsep(&buf, ",");
+
+ if (!token)
+ break;
+ if (!*token)
+ continue;
+
+ /* Restarting syscalls requires that we stop them first */
+ if (disable_first)
+ ftrace_set_clr_event(tr, token, 0);
+
+ ret = ftrace_set_clr_event(tr, token, 1);
+ if (ret)
+ pr_warn("Failed to enable trace event: %s\n", token);
+
+ /* Put back the comma to allow this to be called again */
+ if (buf)
+ *(buf - 1) = ',';
+ }
+}
+
static __init int event_trace_enable(void)
{
struct trace_array *tr = top_trace_array();
struct ftrace_event_call **iter, *call;
- char *buf = bootup_event_buf;
- char *token;
int ret;
if (!tr)
@@ -2456,18 +2483,7 @@ static __init int event_trace_enable(void)
*/
__trace_early_add_events(tr);
- while (true) {
- token = strsep(&buf, ",");
-
- if (!token)
- break;
- if (!*token)
- continue;
-
- ret = ftrace_set_clr_event(tr, token, 1);
- if (ret)
- pr_warn("Failed to enable trace event: %s\n", token);
- }
+ early_enable_events(tr, false);
trace_printk_start_comm();
@@ -2478,6 +2494,31 @@ static __init int event_trace_enable(void)
return 0;
}
+/*
+ * event_trace_enable() is called from trace_event_init() first to
+ * initialize events and perhaps start any events that are on the
+ * command line. Unfortunately, there are some events that will not
+ * start this early, like the system call tracepoints that need
+ * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable()
+ * is called before pid 1 starts, and this flag is never set, making
+ * the syscall tracepoint never get reached, but the event is enabled
+ * regardless (and not doing anything).
+ */
+static __init int event_trace_enable_again(void)
+{
+ struct trace_array *tr;
+
+ tr = top_trace_array();
+ if (!tr)
+ return -ENODEV;
+
+ early_enable_events(tr, true);
+
+ return 0;
+}
+
+early_initcall(event_trace_enable_again);
+
static __init int event_trace_init(void)
{
struct trace_array *tr;
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index b0b1c44e923a..3ccf5c2c1320 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -132,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv)
static __init int kdb_ftrace_register(void)
{
- kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
- "Dump ftrace log", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
+ "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE);
return 0;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5edb518be345..296079ae6583 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
if (!entry)
return;
@@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
if (!entry)
return;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..f97f6e3a676c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size -= sizeof(u32);
rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
- sys_data->enter_event->event.type, regs, &rctx);
+ sys_data->enter_event->event.type, NULL, &rctx);
if (!rec)
return;
@@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
size -= sizeof(u32);
rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
- sys_data->exit_event->event.type, regs, &rctx);
+ sys_data->exit_event->event.type, NULL, &rctx);
if (!rec)
return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8520acc34b18..b11441321e7a 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
if (hlist_empty(head))
goto out;
- entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
if (!entry)
goto out;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6202b08f1933..beeeac9e0e3e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1841,17 +1841,11 @@ static void pool_mayday_timeout(unsigned long __pool)
* spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations. Called only from
* manager.
- *
- * Return:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
*/
-static bool maybe_create_worker(struct worker_pool *pool)
+static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
- if (!need_to_create_worker(pool))
- return false;
restart:
spin_unlock_irq(&pool->lock);
@@ -1877,7 +1871,6 @@ restart:
*/
if (need_to_create_worker(pool))
goto restart;
- return true;
}
/**
@@ -1897,16 +1890,14 @@ restart:
* multiple times. Does GFP_KERNEL allocations.
*
* Return:
- * %false if the pool don't need management and the caller can safely start
- * processing works, %true indicates that the function released pool->lock
- * and reacquired it to perform some management function and that the
- * conditions that the caller verified while holding the lock before
- * calling the function might no longer be true.
+ * %false if the pool doesn't need management and the caller can safely
+ * start processing works, %true if management function was performed and
+ * the conditions that the caller verified before calling the function may
+ * no longer be true.
*/
static bool manage_workers(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- bool ret = false;
/*
* Anyone who successfully grabs manager_arb wins the arbitration
@@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker)
* actual management, the pool may stall indefinitely.
*/
if (!mutex_trylock(&pool->manager_arb))
- return ret;
+ return false;
- ret |= maybe_create_worker(pool);
+ maybe_create_worker(pool);
mutex_unlock(&pool->manager_arb);
- return ret;
+ return true;
}
/**