抓虫日记之 kgdb 与 softlockup(2)

DDD  2009年11月11日 星期三 15:54 | 2807次浏览 | 0条评论

If sched_clock() is implemented on top of jiffies, kgdb often
triggers soft lockup warning messages when resuming from or
detaching from a debug session.

A: BUG现象
*****:[cpu6] soft lockup - now [16002397s],touch_timestamp[16002307s]!***********************
BUG: soft lockup - CPU#6 stuck for 90s! [swapper:0]
Modules linked in: kgdboe
Cpu 6
$ 0   : 0000000000000000 ffffffff81590030 ffffffff81132400 0000000000000001
$ 4   : 0000000000000001 a800000031514000 ffffffff81596100 ffffffffffff00fe
$ 8   : 0000000000000000 a8000000315cfed0 0000000000000018 0000000000000001
$12   : 0000000000000000 0000000000008c00 a8000000315cc000 0000000000000000
$16   : ffffffff81597a20 0000000000000040 ffffffff81590000 96f513832dea2706
$20   : 2dea27065bd44e0c 5bd44e0cb7a89c19 b7a89c196f513832 6f513832dea27065
$24   : 0000000000000002 ffffffff81106e40                                 
$28   : a800000031514000 a800000031517fc0 a89c196f513832de ffffffff81134b64
Hi    : 0000000000000000
Lo    : 0000000000000000
epc   : ffffffff81132420 r4k_wait+0x20/0x40
    Not tainted
ra    : ffffffff81134b64 cpu_idle+0x7c/0xb8
Status: 10008ce3    KX SX UX KERNEL EXL IE
Cause : 40808000
PrId  : 000d0408 (Cavium Octeon)
...

**************************************************************************************


B: BUG重现步骤&现场分析
1: The "cpu sched clock" is at "jiffies A" prior to the call to
kgdb_handle_exception().

2: The debugger waits in kgdb_handle_exception() for 80 seconds; on exit,
touch_softlockup_watchdog() is called.

3: The value of jiffies was not updated while in kgdb because interrupts
were disabled, so the touch_timestamp of the softlockup watchdog is
still "jiffies A" at the first timer interrupt after resuming from
kgdb_handle_exception().

4: jiffies is then updated to "jiffies B" on the tick_do_timer_cpu,
so "jiffies B" = "jiffies A" + 80 seconds.

5: Because ("jiffies B" - "jiffies A") >= 60 seconds, the softlockup
 warning is tripped.

C: BUG触发原因

While in kgdb, jiffies is not updated because interrupts are disabled.

D: BUG解决方法
在退出kgdb的时候设置个标志位表明要更新jiffies.
由于系统中只能有一个cpu在更新jiffies,所以其它的CPU只能先关闭softlockup功能,在等待那个CPU更新完jiffies后,
再开启,具体实现请看patch.

E:Others

Patch:
---
 include/linux/sched.h     |    4 ++++
 include/linux/tick.h      |    4 ++++
 kernel/kgdb.c             |   15 +++++++++++++++
 kernel/softlockup.c       |   26 +++++++++++++++++++-------
 kernel/time/tick-common.c |    5 +++++
 kernel/time/tick-sched.c  |   26 ++++++++++++++++++++++++++
 6 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b42c488..57e2e2d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -297,6 +297,7 @@ extern void softlockup_tick(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
+extern void softlockup_update_jiffies(void);
 extern unsigned int  softlockup_panic;
 extern unsigned long sysctl_hung_task_check_count;
 extern unsigned long sysctl_hung_task_timeout_secs;
@@ -318,6 +319,9 @@ static inline void touch_softlockup_watchdog_sync(void)
 static inline void touch_all_softlockup_watchdogs(void)
 {
 }
+static inline void softlockup_update_jiffies(void)
+{
+}
 #endif
 
 
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 8cf8cfe..93c9ff7 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -69,6 +69,7 @@ struct tick_sched {
 extern void __init tick_init(void);
 extern int tick_is_oneshot_available(void);
 extern struct tick_device *tick_get_device(int cpu);
+extern int get_tick_do_timer_cpu(void);
 
 # ifdef CONFIG_HIGH_RES_TIMERS
 extern int tick_init_highres(void);
@@ -96,9 +97,11 @@ extern cpumask_t *tick_get_broadcast_oneshot_mask(void);
 extern void tick_clock_notify(void);
 extern int tick_check_oneshot_change(int allow_nohz);
 extern struct tick_sched *tick_get_tick_sched(int cpu);
+extern int tick_update_jiffies(void);
 # else
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline int tick_update_jiffies(void) { return 0; }
 # endif
 
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
@@ -106,6 +109,7 @@ static inline void tick_init(void) { }
 static inline void tick_cancel_sched_timer(int cpu) { }
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline int get_tick_do_timer_cpu(void) { return 0; }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
 # ifdef CONFIG_NO_HZ
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 235c3ff..bbe49bb 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -48,6 +48,7 @@
 #include <linux/pid.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/tick.h>
 
 #include <asm/cacheflush.h>
 #include <asm/byteorder.h>
@@ -1565,6 +1566,12 @@ acquirelock:
        atomic_set(&cpu_in_kgdb[ks->cpu], 0);
 
        if (!kgdb_single_step) {
+               /*
+                * Set update jiffy flags before releasing
+                * the others cpu.
+                */
+               softlockup_update_jiffies();
+
                for (i = NR_CPUS-1; i >= 0; i--)
                        atomic_set(&passive_cpu_wait[i], 0);
                /*
@@ -1585,6 +1592,14 @@ kgdb_restore:
                else
                        kgdb_sstep_pid = 0;
        }
+
+       /*
+        * update the jiffies value if the current cpu is the CPU
+        * which responsible for global tick when kgdb do single setp.
+        */
+       if (kgdb_single_step && get_tick_do_timer_cpu() == cpu)
+               softlockup_update_jiffies();
+
        /* Free kgdb_active */
        atomic_set(&kgdb_active, -1);
        touch_softlockup_watchdog_sync();
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 8b24917..713f5e5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -16,6 +16,7 @@
 #include <linux/lockdep.h>
 #include <linux/notifier.h>
 #include <linux/module.h>
+#include <linux/tick.h>
 
 #include <asm/irq_regs.h>
 
@@ -79,11 +80,18 @@ void touch_softlockup_watchdog(void)
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
 static int softlock_touch_sync[NR_CPUS];
+atomic_t __read_mostly softlock_update_jiffies = ATOMIC_INIT(0);
+
+
+void softlockup_update_jiffies(void)
+{
+       atomic_inc(&softlock_update_jiffies);
+}
 
 void touch_softlockup_watchdog_sync(void)
 {
-       softlock_touch_sync[raw_smp_processor_id()] = 1;
-       __raw_get_cpu_var(touch_timestamp) = 0;
+       softlock_touch_sync[raw_smp_processor_id()] = 1;
+       __raw_get_cpu_var(touch_timestamp) = 0;
 }
 
 void touch_all_softlockup_watchdogs(void)
@@ -118,11 +126,15 @@ void softlockup_tick(void)
 
        if (touch_timestamp == 0) {
                if (unlikely(softlock_touch_sync[this_cpu])) {
-                       /*
-                        * If the time stamp was touched atomically
-                        * make sure the scheduler tick is up to date.
-                        */
-                       softlock_touch_sync[this_cpu] = 0;
+
+                       /* make sure the jiffies is up to date. */
+                       if (unlikely(atomic_read(&softlock_update_jiffies))) {
+                               if (tick_update_jiffies())
+                                       return;
+                               atomic_set(&softlock_update_jiffies, 0);
+                       }
+
+                       /* make sure the scheduler tick is up to date. */
                        sched_clock_tick();
                }
                __touch_softlockup_watchdog();
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index bad22e2..60e9bee 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -36,6 +36,11 @@ ktime_t tick_period;
 int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
 DEFINE_SPINLOCK(tick_device_lock);
 
+int get_tick_do_timer_cpu(void)
+{
+       return tick_do_timer_cpu;
+}
+
 /*
  * Debugging: see timer_list.c
  */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb02324..0898427 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -83,6 +83,32 @@ static void tick_do_update_jiffies64(ktime_t now)
 }
 
 /*
+ * tick_update_jiffies() - update the global jiffies
+ *
+ * If current CPU is the CPU which responsible for global tick, then
+ * do update the jiffies value. or it will do nothing, and return 1.
+ */
+int tick_update_jiffies(void)
+{
+       unsigned long flags;
+       int cpu = smp_processor_id();
+
+       if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+               tick_do_timer_cpu = cpu;
+
+       /* Check, if the jiffies need an update */
+       if (tick_do_timer_cpu != cpu)
+               return 1;
+
+       /* do update jiffies */
+       local_irq_save(flags);
+       tick_do_update_jiffies64(ktime_get());
+       local_irq_restore(flags);
+
+       return 0;
+}
+
+/*
  * Initialize and return retrieve the jiffies update.
  */
 static ktime_t tick_init_jiffy_update(void)
--
1.6.0.4

评论

我的评论:

发表评论

请 登录 后发表评论。还没有在Zeuux哲思注册吗?现在 注册 !

暂时没有评论

Zeuux © 2024

京ICP备05028076号