DDD 2009年11月11日 星期三 15:54 | 2807次浏览 | 0条评论
If sched_clock() is implemented on top of jiffies, kgdb often
triggers soft lockup warning messages when resuming from or
detaching from a debug session.
A: BUG现象
*****:[cpu6] soft lockup - now [16002397s],touch_timestamp[16002307s]!***********************
BUG: soft lockup - CPU#6 stuck for 90s! [swapper:0]
Modules linked in: kgdboe
Cpu 6
$ 0 : 0000000000000000 ffffffff81590030 ffffffff81132400 0000000000000001
$ 4 : 0000000000000001 a800000031514000 ffffffff81596100 ffffffffffff00fe
$ 8 : 0000000000000000 a8000000315cfed0 0000000000000018 0000000000000001
$12 : 0000000000000000 0000000000008c00 a8000000315cc000 0000000000000000
$16 : ffffffff81597a20 0000000000000040 ffffffff81590000 96f513832dea2706
$20 : 2dea27065bd44e0c 5bd44e0cb7a89c19 b7a89c196f513832 6f513832dea27065
$24 : 0000000000000002 ffffffff81106e40
$28 : a800000031514000 a800000031517fc0 a89c196f513832de ffffffff81134b64
Hi : 0000000000000000
Lo : 0000000000000000
epc : ffffffff81132420 r4k_wait+0x20/0x40
Not tainted
ra : ffffffff81134b64 cpu_idle+0x7c/0xb8
Status: 10008ce3 KX SX UX KERNEL EXL IE
Cause : 40808000
PrId : 000d0408 (Cavium Octeon)
...
**************************************************************************************
B: BUG重现步骤&现场分析
1: The "cpu sched clock" value is "jiffies A" prior to the call to
kgdb_handle_exception().
2: The debugger waits in kgdb_handle_exception() for 80 seconds; on exit,
touch_softlockup_watchdog() is called.
3: The value of jiffies was not updated while in kgdb because interrupts
were disabled, so the touch_timestamp of the softlockup watchdog is
still "jiffies A" in the first timer interrupt after resuming from
kgdb_handle_exception().
4: jiffies is then updated to "jiffies B" on the tick_do_timer_cpu,
so "jiffies B" = "jiffies A" + 80 seconds.
5: Because ("jiffies B" - "jiffies A") >= 60 seconds, the softlockup
warning is triggered.
C: BUG触发原因
While in kgdb, jiffies is not updated because interrupts are disabled.
D: BUG解决方法
在退出kgdb的时候设置个标志位表明要更新jiffies.
由于系统中只能有一个cpu在更新jiffies,所以其它的CPU只能先关闭softlockup功能,在等待那个CPU更新完jiffies后,
再开启,具体实现请看patch.
E:Others
Patch:
---
include/linux/sched.h | 4 ++++
include/linux/tick.h | 4 ++++
kernel/kgdb.c | 15 +++++++++++++++
kernel/softlockup.c | 26 +++++++++++++++++++-------
kernel/time/tick-common.c | 5 +++++
kernel/time/tick-sched.c | 26 ++++++++++++++++++++++++++
6 files changed, 73 insertions(+), 7 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b42c488..57e2e2d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -297,6 +297,7 @@ extern void softlockup_tick(void);
extern void touch_softlockup_watchdog(void);
extern void touch_softlockup_watchdog_sync(void);
extern void touch_all_softlockup_watchdogs(void);
+extern void softlockup_update_jiffies(void);
extern unsigned int softlockup_panic;
extern unsigned long sysctl_hung_task_check_count;
extern unsigned long sysctl_hung_task_timeout_secs;
@@ -318,6 +319,9 @@ static inline void touch_softlockup_watchdog_sync(void)
static inline void touch_all_softlockup_watchdogs(void)
{
}
+static inline void softlockup_update_jiffies(void)
+{
+}
#endif
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 8cf8cfe..93c9ff7 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -69,6 +69,7 @@ struct tick_sched {
extern void __init tick_init(void);
extern int tick_is_oneshot_available(void);
extern struct tick_device *tick_get_device(int cpu);
+extern int get_tick_do_timer_cpu(void);
# ifdef CONFIG_HIGH_RES_TIMERS
extern int tick_init_highres(void);
@@ -96,9 +97,11 @@ extern cpumask_t *tick_get_broadcast_oneshot_mask(void);
extern void tick_clock_notify(void);
extern int tick_check_oneshot_change(int allow_nohz);
extern struct tick_sched *tick_get_tick_sched(int cpu);
+extern int tick_update_jiffies(void);
# else
static inline void tick_clock_notify(void) { }
static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline int tick_update_jiffies(void) { return 0; }
# endif
#else /* CONFIG_GENERIC_CLOCKEVENTS */
@@ -106,6 +109,7 @@ static inline void tick_init(void) { }
static inline void tick_cancel_sched_timer(int cpu) { }
static inline void tick_clock_notify(void) { }
static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline int get_tick_do_timer_cpu(void) { return 0; }
#endif /* !CONFIG_GENERIC_CLOCKEVENTS */
# ifdef CONFIG_NO_HZ
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 235c3ff..bbe49bb 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -48,6 +48,7 @@
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/tick.h>
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
@@ -1565,6 +1566,12 @@ acquirelock:
atomic_set(&cpu_in_kgdb[ks->cpu], 0);
if (!kgdb_single_step) {
+ /*
+ * Set update jiffy flags before releasing
+ * the others cpu.
+ */
+ softlockup_update_jiffies();
+
for (i = NR_CPUS-1; i >= 0; i--)
atomic_set(&passive_cpu_wait[i], 0);
/*
@@ -1585,6 +1592,14 @@ kgdb_restore:
else
kgdb_sstep_pid = 0;
}
+
+ /*
+ * update the jiffies value if the current cpu is the CPU
+ * which responsible for global tick when kgdb do single setp.
+ */
+ if (kgdb_single_step && get_tick_do_timer_cpu() == cpu)
+ softlockup_update_jiffies();
+
/* Free kgdb_active */
atomic_set(&kgdb_active, -1);
touch_softlockup_watchdog_sync();
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 8b24917..713f5e5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -16,6 +16,7 @@
#include <linux/lockdep.h>
#include <linux/notifier.h>
#include <linux/module.h>
+#include <linux/tick.h>
#include <asm/irq_regs.h>
@@ -79,11 +80,18 @@ void touch_softlockup_watchdog(void)
EXPORT_SYMBOL(touch_softlockup_watchdog);
static int softlock_touch_sync[NR_CPUS];
+atomic_t __read_mostly softlock_update_jiffies = ATOMIC_INIT(0);
+
+
+void softlockup_update_jiffies(void)
+{
+ atomic_inc(&softlock_update_jiffies);
+}
void touch_softlockup_watchdog_sync(void)
{
- softlock_touch_sync[raw_smp_processor_id()] = 1;
- __raw_get_cpu_var(touch_timestamp) = 0;
+ softlock_touch_sync[raw_smp_processor_id()] = 1;
+ __raw_get_cpu_var(touch_timestamp) = 0;
}
void touch_all_softlockup_watchdogs(void)
@@ -118,11 +126,15 @@ void softlockup_tick(void)
if (touch_timestamp == 0) {
if (unlikely(softlock_touch_sync[this_cpu])) {
- /*
- * If the time stamp was touched atomically
- * make sure the scheduler tick is up to date.
- */
- softlock_touch_sync[this_cpu] = 0;
+
+ /* make sure the jiffies is up to date. */
+ if (unlikely(atomic_read(&softlock_update_jiffies))) {
+ if (tick_update_jiffies())
+ return;
+ atomic_set(&softlock_update_jiffies, 0);
+ }
+
+ /* make sure the scheduler tick is up to date. */
sched_clock_tick();
}
__touch_softlockup_watchdog();
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index bad22e2..60e9bee 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -36,6 +36,11 @@ ktime_t tick_period;
int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
DEFINE_SPINLOCK(tick_device_lock);
+int get_tick_do_timer_cpu(void)
+{
+ return tick_do_timer_cpu;
+}
+
/*
* Debugging: see timer_list.c
*/
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb02324..0898427 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -83,6 +83,32 @@ static void tick_do_update_jiffies64(ktime_t now)
}
/*
+ * tick_update_jiffies() - update the global jiffies
+ *
+ * If current CPU is the CPU which responsible for global tick, then
+ * do update the jiffies value. or it will do nothing, and return 1.
+ */
+int tick_update_jiffies(void)
+{
+ unsigned long flags;
+ int cpu = smp_processor_id();
+
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+ tick_do_timer_cpu = cpu;
+
+ /* Check, if the jiffies need an update */
+ if (tick_do_timer_cpu != cpu)
+ return 1;
+
+ /* do update jiffies */
+ local_irq_save(flags);
+ tick_do_update_jiffies64(ktime_get());
+ local_irq_restore(flags);
+
+ return 0;
+}
+
+/*
* Initialize and return retrieve the jiffies update.
*/
static ktime_t tick_init_jiffy_update(void)
--
1.6.0.4
Zeuux © 2024
京ICP备05028076号
暂时没有评论