hangcheck-timer.c (5208B)
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * hangcheck-timer.c 4 * 5 * Driver for a little io fencing timer. 6 * 7 * Copyright (C) 2002, 2003 Oracle. All rights reserved. 8 * 9 * Author: Joel Becker <joel.becker@oracle.com> 10 */ 11 12/* 13 * The hangcheck-timer driver uses the TSC to catch delays that 14 * jiffies does not notice. A timer is set. When the timer fires, it 15 * checks whether it was delayed and if that delay exceeds a given 16 * margin of error. The hangcheck_tick module parameter takes the timer 17 * duration in seconds. The hangcheck_margin parameter defines the 18 * margin of error, in seconds. The defaults are 60 seconds for the 19 * timer and 180 seconds for the margin of error. IOW, a timer is set 20 * for 60 seconds. When the timer fires, the callback checks the 21 * actual duration that the timer waited. If the duration exceeds the 22 * allotted time and margin (here 60 + 180, or 240 seconds), the machine 23 * is restarted. A healthy machine will have the duration match the 24 * expected timeout very closely. 25 */ 26 27#include <linux/module.h> 28#include <linux/moduleparam.h> 29#include <linux/types.h> 30#include <linux/kernel.h> 31#include <linux/fs.h> 32#include <linux/mm.h> 33#include <linux/reboot.h> 34#include <linux/init.h> 35#include <linux/delay.h> 36#include <linux/uaccess.h> 37#include <linux/sysrq.h> 38#include <linux/timer.h> 39#include <linux/hrtimer.h> 40 41#define VERSION_STR "0.9.1" 42 43#define DEFAULT_IOFENCE_MARGIN 60 /* Default fudge factor, in seconds */ 44#define DEFAULT_IOFENCE_TICK 180 /* Default timer timeout, in seconds */ 45 46static int hangcheck_tick = DEFAULT_IOFENCE_TICK; 47static int hangcheck_margin = DEFAULT_IOFENCE_MARGIN; 48static int hangcheck_reboot; /* Defaults to not reboot */ 49static int hangcheck_dump_tasks; /* Defaults to not dumping SysRQ T */ 50 51/* options - modular */ 52module_param(hangcheck_tick, int, 0); 53MODULE_PARM_DESC(hangcheck_tick, "Timer delay."); 54module_param(hangcheck_margin, int, 0); 55MODULE_PARM_DESC(hangcheck_margin, "If the hangcheck timer has been delayed more than hangcheck_margin seconds, the driver will fire."); 56module_param(hangcheck_reboot, int, 0); 57MODULE_PARM_DESC(hangcheck_reboot, "If nonzero, the machine will reboot when the timer margin is exceeded."); 58module_param(hangcheck_dump_tasks, int, 0); 59MODULE_PARM_DESC(hangcheck_dump_tasks, "If nonzero, the machine will dump the system task state when the timer margin is exceeded."); 60 61MODULE_AUTHOR("Oracle"); 62MODULE_DESCRIPTION("Hangcheck-timer detects when the system has gone out to lunch past a certain margin."); 63MODULE_LICENSE("GPL"); 64MODULE_VERSION(VERSION_STR); 65 66/* options - nonmodular */ 67#ifndef MODULE 68 69static int __init hangcheck_parse_tick(char *str) 70{ 71 int par; 72 if (get_option(&str,&par)) 73 hangcheck_tick = par; 74 return 1; 75} 76 77static int __init hangcheck_parse_margin(char *str) 78{ 79 int par; 80 if (get_option(&str,&par)) 81 hangcheck_margin = par; 82 return 1; 83} 84 85static int __init hangcheck_parse_reboot(char *str) 86{ 87 int par; 88 if (get_option(&str,&par)) 89 hangcheck_reboot = par; 90 return 1; 91} 92 93static int __init hangcheck_parse_dump_tasks(char *str) 94{ 95 int par; 96 if (get_option(&str,&par)) 97 hangcheck_dump_tasks = par; 98 return 1; 99} 100 101__setup("hcheck_tick", hangcheck_parse_tick); 102__setup("hcheck_margin", hangcheck_parse_margin); 103__setup("hcheck_reboot", hangcheck_parse_reboot); 104__setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); 105#endif /* not MODULE */ 106 107#define TIMER_FREQ 1000000000ULL 108 109/* Last time scheduled */ 110static unsigned long long hangcheck_tsc, hangcheck_tsc_margin; 111 112static void hangcheck_fire(struct timer_list *); 113 114static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire); 115 116static void hangcheck_fire(struct timer_list *unused) 117{ 118 unsigned long long cur_tsc, tsc_diff; 119 120 cur_tsc = ktime_get_ns(); 121 122 if (cur_tsc > hangcheck_tsc) 123 tsc_diff = cur_tsc - hangcheck_tsc; 124 else 125 tsc_diff = (cur_tsc + (~0ULL - hangcheck_tsc)); /* or something */ 126 127 if (tsc_diff > hangcheck_tsc_margin) { 128 if (hangcheck_dump_tasks) { 129 printk(KERN_CRIT "Hangcheck: Task state:\n"); 130#ifdef CONFIG_MAGIC_SYSRQ 131 handle_sysrq('t'); 132#endif /* CONFIG_MAGIC_SYSRQ */ 133 } 134 if (hangcheck_reboot) { 135 printk(KERN_CRIT "Hangcheck: hangcheck is restarting the machine.\n"); 136 emergency_restart(); 137 } else { 138 printk(KERN_CRIT "Hangcheck: hangcheck value past margin!\n"); 139 } 140 } 141#if 0 142 /* 143 * Enable to investigate delays in detail 144 */ 145 printk("Hangcheck: called %Ld ns since last time (%Ld ns overshoot)\n", 146 tsc_diff, tsc_diff - hangcheck_tick*TIMER_FREQ); 147#endif 148 mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 149 hangcheck_tsc = ktime_get_ns(); 150} 151 152 153static int __init hangcheck_init(void) 154{ 155 printk("Hangcheck: starting hangcheck timer %s (tick is %d seconds, margin is %d seconds).\n", 156 VERSION_STR, hangcheck_tick, hangcheck_margin); 157 hangcheck_tsc_margin = 158 (unsigned long long)hangcheck_margin + hangcheck_tick; 159 hangcheck_tsc_margin *= TIMER_FREQ; 160 161 hangcheck_tsc = ktime_get_ns(); 162 mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 163 164 return 0; 165} 166 167 168static void __exit hangcheck_exit(void) 169{ 170 del_timer_sync(&hangcheck_ticktock); 171 printk("Hangcheck: Stopped hangcheck timer.\n"); 172} 173 174module_init(hangcheck_init); 175module_exit(hangcheck_exit);