
Android系统运行以后,system_server中可能有成百上千个线程在运行,各种服务之间的调用非常频繁,也很复杂,难免会出现死锁和长时间未响应的问题。这类问题对于系统来说是非常严重的,因为一旦出现,就会引发一系列连锁反应,最终导致界面卡死、手机耗电急剧上升、发热严重。当然,我们要做的第一步是尽量避免此类情况的发生,这需要大量的测试和实践,Android系统在这方面已经做得很不错了;但同时也要考虑一旦出现这种情况,系统应当如何处理。本文主要回顾framework层Watchdog、ANR检测及处理的相关知识。
Watchdog检测原理 Watchdog主要对系统中重要的服务进行检测和处理,下面从源码的角度来分析它是如何实现的。Watchdog本身是一个线程,继承于Thread,在system_server初始化的过程中启动。

private Watchdog() { super("watchdog"); // The shared foreground thread is the main checker.It is where we // will also dispatch monitor checks and do other work. mMonitorChecker = new HandlerChecker(FgThread.getHandler(), "foreground thread", DEFAULT_TIMEOUT); mHandlerCheckers.add(mMonitorChecker); // Add checker for main thread.We only do a quick check since there // can be UI running on the thread. mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()), "main thread", DEFAULT_TIMEOUT)); // Add checker for shared UI thread. mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), "ui thread", DEFAULT_TIMEOUT)); // And also check IO thread. mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), "i/o thread", DEFAULT_TIMEOUT)); // And the display thread. mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(), "display thread", DEFAULT_TIMEOUT)); }

首先,在它的初始化过程中,将几个重要的线程添加到mHandlerCheckers中。这些线程都是基于Looper的事件驱动线程(除主线程外,其余均继承于HandlerThread),而HandlerChecker本身是一个Runnable对象。前台线程对应的mMonitorChecker是最主要的检测者,外界服务添加的monitor检查都是添加到mMonitorChecker中。
/**
 * Registers a monitor with the main monitor checker.
 *
 * <p>The check and the registration both happen while holding the watchdog's
 * own lock.
 *
 * @param monitor the monitor to hand to {@code mMonitorChecker}
 * @throws RuntimeException if the watchdog thread is already alive
 */
public void addMonitor(Monitor monitor) {
    synchronized (this) {
        final boolean alreadyRunning = isAlive();
        if (!alreadyRunning) {
            mMonitorChecker.addMonitor(monitor);
            return;
        }
        throw new RuntimeException("Monitors can't be added once the Watchdog is running");
    }
}

@Override public void run() { boolean waitedHalf = false; while (true) { final ArrayList blockedCheckers; final String subject; final boolean allowRestart; //可动态设置,当发生死锁,系统是否需要重启 int debuggerWasConnected = 0; synchronized (this) { long timeout = CHECK_INTERVAL; // 30s //会调用每个线程对应的HandlerCheckers的scheduleCheckLocked方法 //HandlerChecker中又持有该线程Handler引用,Handler又能获取到Looper for (int i=0; i 0) { debuggerWasConnected--; }//记录开始时间 long start = SystemClock.uptimeMillis(); while (timeout > 0) { if (Debug.isDebuggerConnected()) { debuggerWasConnected = 2; } try { wait(timeout); //等待30s } catch (InterruptedException e) {, e); } if (Debug.isDebuggerConnected()) { debuggerWasConnected = 2; } timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); }//这个方法稍后分析,waitState 是执行完获取HandlerCheck检测结果 final int waitState = evaluateCheckerCompletionLocked(); if (waitState == COMPLETED) { //代表没有死锁的发生,重新开始 // The monitors have returned; reset waitedHalf = false; continue; } else if (waitState == WAITING) {//还是等待中 // still waiting but within their configured intervals; back off and recheck continue; } else if (waitState == WAITED_HALF) { //如果30s内HandleCheck未执行完,则打印native进程状态 if (!waitedHalf) { // We've waited half the deadlock-detection interval.Pull a stack // trace and wait another half. 
ArrayList pids = new ArrayList(); pids.add(Process.myPid()); ActivityManagerService.dumpStackTraces(true, pids, null, null, NATIVE_STACKS_OF_INTEREST); waitedHalf = true; } continue; }//如果1分钟还未执行完,则获取哪些HandlerChecker堵塞了。 blockedCheckers = getBlockedCheckersLocked(); //将堵塞详细信息打印出来 subject = describeCheckersLocked(blockedCheckers); allowRestart = mAllowRestart; }//记录到EventLog中 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); ArrayList pids = new ArrayList(); pids.add(Process.myPid()); if (mPhonePid > 0) pids.add(mPhonePid); //打印核心native进程堆栈信息 final File stack = ActivityManagerService.dumpStackTraces( !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST); //等待两秒 SystemClock.sleep(2000); //打印kernel线程执行堆栈信息 if (RECORD_KERNEL_THREADS) { dumpKernelStackTraces(); }//触发kernel打印所有堵塞线程调用栈信息 try { FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); sysrq_trigger.write("w"); sysrq_trigger.close(); } catch (IOException e) { Slog.e(TAG, "Failed to write to /proc/sysrq-trigger"); Slog.e(TAG, e.getMessage()); }//给两秒时间记录到 dropbox中 (data/system/dropbox) Thread dropboxThread = new Thread("watchdogWriteToDropbox") { public void run() { mActivity.addErrorToDropBox( "watchdog", null, "system_server", null, null, subject, null, stack, null); } }; dropboxThread.start(); try { dropboxThread.join(2000); // wait up to 2 seconds for it to return. } catch (InterruptedException ignored) {}//...//这里在调试模式中和当allowRestart为false的情况下,不允许杀死进程 if (debuggerWasConnected >= 2) { Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); } else if (debuggerWasConnected > 0) { Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); } else if (!allowRestart) { Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); } else { Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); for (int i=0; i

  1. 执行HandlerChecker中的scheduleCheckLocked方法,通过handler引用的looper对象,将自己丢入对应线程的消息队列中,执行死锁检测。
  2. while循环中,每过30s会查看一下HandlerChecker的检测结果,如果没有发生堵塞,则重新开始;如果堵塞了,则进入第三步。
  3. 将堵塞线程调用堆栈打印出来,搜集各类日志,包括kernel堵塞线程堆栈,核心native进程 dump信息,并持久化,最后杀死自己,让init进程重启自己。
public void scheduleCheckLocked() { if (mMonitors.size() == 0 && mHandler.getLooper().isIdling()) { mCompleted = true; return; }if (!mCompleted) { // we already have a check in flight, so no need return; }mCompleted = false; mCurrentMonitor = null; mStartTime = SystemClock.uptimeMillis(); //将自己丢入MessageQueue中 mHandler.postAtFrontOfQueue(this); }//当线程执行到这个消息的时候,进来 @Override public void run() { final int size = mMonitors.size(); for (int i = 0 ; i < size ; i++) { synchronized (Watchdog.this) { mCurrentMonitor = mMonitors.get(i); } //其实就是执行每个Monitor.monitor方法 mCurrentMonitor.monitor(); }//如果没有发生堵塞,则完成检测,否则就卡在上面了。 synchronized (Watchdog.this) { mCompleted = true; mCurrentMonitor = null; } }//下面是检测AMS的例子,其他每个服务都是如此实现得。 public final class ActivityManagerService extends ActivityManagerNative implements Watchdog.Monitor, BatteryStatsImpl.BatteryCallback { ... //如果发生死锁,则无法获取到锁对象,注意外界调用AMS的方法,同步都是使用AMS实例这把“锁” public void monitor() { synchronized (this) { } } ... }

private ArrayList getBlockedCheckersLocked() { ArrayList checkers = new ArrayList(); for (int i=0; i mStartTime + mWaitMax); }

ANR检测机制和处理 首先来看看Android系统在哪些情况会触发anr:
  • 前台服务20s内未执行完成
  • 前台广播10s内未执行完成,后台广播60s内未执行完成
  • 内容提供者执行publishProvider,超时10s
  • 输入事件超时5s
private final class BroadcastHandler extends Handler { public BroadcastHandler(Looper looper) { super(looper, null, true); }@Override public void handleMessage(Message msg) { switch (msg.what) { //接受intent处理 case BROADCAST_INTENT_MSG: { if (DEBUG_BROADCAST) Slog.v( TAG, "Received BROADCAST_INTENT_MSG"); processNextBroadcast(true); } break; //消息超时处理 case BROADCAST_TIMEOUT_MSG: { synchronized (mService) { broadcastTimeoutLocked(true); } } break; } } }; //获取下一条广播 int recIdx = r.nextReceiver++; //记录当时时间 r.receiverTime = SystemClock.uptimeMillis(); if (recIdx == 0) { r.dispatchTime = r.receiverTime; r.dispatchClockTime = System.currentTimeMillis(); if (DEBUG_BROADCAST_LIGHT) Slog.v(TAG, "Processing ordered broadcast [" + mQueueName + "] " + r); } if (! mPendingBroadcastTimeoutMessage) { long timeoutTime = r.receiverTime + mTimeoutPeriod; if (DEBUG_BROADCAST) Slog.v(TAG, "Submitting BROADCAST_TIMEOUT_MSG [" + mQueueName + "] for " + r + " at " + timeoutTime); //向消息队列中丢向Anr触发的延时消息 setBroadcastTimeoutLocked(timeoutTime); }final void setBroadcastTimeoutLocked(long timeoutTime) { if (! mPendingBroadcastTimeoutMessage) { Message msg = mHandler.obtainMessage(BROADCAST_TIMEOUT_MSG, this); mHandler.sendMessageAtTime(msg, timeoutTime); mPendingBroadcastTimeoutMessage = true; } }

if (r.receivers == null || r.nextReceiver >= numReceivers || r.resultAbort || forceReceive) { // No more receivers for this broadcast!Send the final // result if requested... if (r.resultTo != null) { try { if (DEBUG_BROADCAST) { int seq = r.intent.getIntExtra("seq", -1); Slog.i(TAG, "Finishing broadcast [" + mQueueName + "] " + r.intent.getAction() + " seq=" + seq + " app=" + r.callerApp); } //处理事件 performReceiveLocked(r.callerApp, r.resultTo, new Intent(r.intent), r.resultCode, r.resultData, r.resultExtras, false, false, r.userId); // Set this to null so that the reference // (local and remote) isn't kept in the mBroadcastHistory. r.resultTo = null; } catch (RemoteException e) { r.resultTo = null; Slog.w(TAG, "Failure [" + mQueueName + "] sending broadcast result of " + r.intent, e); } }//处理完事件,取消消息 cancelBroadcastTimeoutLocked(); // ... and on to the next... addBroadcastToHistoryLocked(r); mOrderedBroadcasts.remove(0); r = null; looped = true; continue; } } while (r == null); //移除消息 final void cancelBroadcastTimeoutLocked() { if (mPendingBroadcastTimeoutMessage) { mHandler.removeMessages(BROADCAST_TIMEOUT_MSG, this); mPendingBroadcastTimeoutMessage = false; } }

//最终会调用到AMS的 appNotResponding 方法中处理anr final void appNotResponding(ProcessRecord app, ActivityRecord activity, ActivityRecord parent, boolean aboveSystem, final String annotation) { ArrayList firstPids = new ArrayList(5); SparseArray lastPids = new SparseArray(20); //从代码表面意思看,如果res是-1则立即杀死应用,0的话会继续操作 if (mController != null) { try { // 0 == continue, -1 = kill process immediately int res = mController.appEarlyNotResponding(app.processName,, annotation); if (res < 0 && != MY_PID) { app.kill("anr", true); } } catch (RemoteException e) { mController = null; Watchdog.getInstance().setActivityController(null); } } //打印当前cpu操作 long anrTime = SystemClock.uptimeMillis(); if (MONITOR_CPU_USAGE) { updateCpuStatsNow(); }synchronized (this) { // PowerManager.reboot() can block for a long time, so ignore ANRs while shutting down. if (mShuttingDown) { Slog.i(TAG, "During shutdown skipping ANR: " + app + " " + annotation); return; } else if (app.notResponding) { Slog.i(TAG, "Skipping duplicate ANR: " + app + " " + annotation); return; } else if (app.crashing) { Slog.i(TAG, "Crashing app skipping ANR: " + app + " " + annotation); return; }// In case we come through here for the same app before completing // this one, mark as anring now so we will bail out. 
app.notResponding = true; //记录到事件日志中去 EventLog.writeEvent(EventLogTags.AM_ANR, app.userId,, app.processName,, annotation); //收集firstPids进程的stacks //第一个是发生anr进程,第二个是system_server,其余的是mLruProcesses所有 //persistent进程 firstPids.add(; int parentPid =; if (parent != null && != null && > 0) parentPid =; if (parentPid != firstPids.add(parentPid); if (MY_PID != && MY_PID != parentPid) firstPids.add(MY_PID); for (int i = mLruProcesses.size() - 1; i >= 0; i--) { ProcessRecord r = mLruProcesses.get(i); if (r != null && r.thread != null) { int pid =; if (pid > 0 && pid != && pid != parentPid && pid != MY_PID) { if (r.persistent) { firstPids.add(pid); } else { lastPids.put(pid, Boolean.TRUE); } } } } }//记录日志 StringBuilder info = new StringBuilder(); info.setLength(0); info.append("ANR in ").append(app.processName); if (activity != null && activity.shortComponentName != null) { info.append(" (").append(activity.shortComponentName).append(")"); } info.append("\n"); info.append("PID: ").append("\n"); if (annotation != null) { info.append("Reason: ").append(annotation).append("\n"); } if (parent != null && parent != activity) { info.append("Parent: ").append(parent.shortComponentName).append("\n"); }//打印cpu的状态信息 final ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(true); //生成traces文件 File tracesFile = dumpStackTraces(true, firstPids, processCpuTracker, lastPids, NATIVE_STACKS_OF_INTEREST); String cpuInfo = null; if (MONITOR_CPU_USAGE) { updateCpuStatsNow(); synchronized (mProcessCpuTracker) { cpuInfo = mProcessCpuTracker.printCurrentState(anrTime); } info.append(processCpuTracker.printCurrentLoad()); info.append(cpuInfo); }info.append(processCpuTracker.printCurrentState(anrTime)); Slog.e(TAG, info.toString()); if (tracesFile == null) { //如果没有生成traces文件,则系统会发成 signal为3的信号 Process.sendSignal(, Process.SIGNAL_QUIT); }//添加到dropbox目录下 addErrorToDropBox("anr", app, app.processName, activity, parent, annotation, cpuInfo, tracesFile, null); if (mController != null) { try { // 0 
== show dialog, 1 = keep waiting, -1 = kill process immediately int res = mController.appNotResponding(app.processName,, info.toString()); if (res != 0) { if (res < 0 && != MY_PID) { app.kill("anr", true); } else { synchronized (this) { mServices.scheduleServiceTimeoutLocked(app); } } return; } } catch (RemoteException e) { mController = null; Watchdog.getInstance().setActivityController(null); } }//根据进程的类型判断是直接杀死还是通知用户anrboolean showBackground = Settings.Secure.getInt(mContext.getContentResolver(), Settings.Secure.ANR_SHOW_BACKGROUND, 0) != 0; synchronized (this) { if (!showBackground && !app.isInterestingToUserLocked() && != MY_PID) { app.kill("bg anr", true); return; }// Set the app's notResponding state, and look up the errorReportReceiver makeAppNotRespondingLocked(app, activity != null ? activity.shortComponentName : null, annotation != null ? "ANR " + annotation : "ANR", info.toString()); // Bring up the infamous App Not Responding dialog Message msg = Message.obtain(); HashMap map = new HashMap(); msg.what = SHOW_NOT_RESPONDING_MSG; msg.obj = map; msg.arg1 = aboveSystem ? 1 : 0; map.put("app", app); if (activity != null) { map.put("activity", activity); }mHandler.sendMessage(msg); } }

总结 上文主要分析了系统对核心服务死锁问题的检测和处理,以及应用层在某些场景下操作耗时过长从而触发ANR时,系统的检测与处理流程。从中我们可以学到以下知识点:
  1. 如何在代码中检测线程死锁问题。
  2. 对线程执行某些事件判断是否超时的方法。
  3. /data/anr/traces.txt文件和dropbox目录下文件记录了很详细的日志
  • jstack 查看java进程的堆栈状态,查看每个线程的运行状态,排查死锁问题
  • top 查看进程/线程所占cpu,内存大小
  • meminfo 查看进程内存占用情况,Android 特定对象存活数量等
  • traceView 查看线程耗时情况和cpu占用率
