<!DOCTYPE html>
<html lang="en" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
    <head>
        <meta charset="utf-8" />
        <meta content="initial-scale=1, minimum-scale=1, width=device-width" name="viewport" />
        <title>
            Google - Site Reliability Engineering
        </title>
        <meta name="referrer" content="no-referrer" />
        <link rel="apple-touch-icon-precomposed" sizes="180x180" href="https://lh3.googleusercontent.com/Yf2DCX8RKda6r4Jml9DLMByS2zQCBFs3kQpvBfN8UgIh4YVWIYSYIQOoTxJriyuM26cT5PDjyEb5aynDQ0Xyz46yHKnfg8JlUbDW" />
        <link rel="stylesheet" href="//fonts.googleapis.com/css?family=Google+Sans:400|Roboto:400,400italic,500,500italic,700,700italic|Roboto+Mono:400,500,700|Material+Icons" />
        <link rel="icon" type="image/png" sizes="32x32" href="https://lh3.googleusercontent.com/Yf2DCX8RKda6r4Jml9DLMByS2zQCBFs3kQpvBfN8UgIh4YVWIYSYIQOoTxJriyuM26cT5PDjyEb5aynDQ0Xyz46yHKnfg8JlUbDW" />
        <link rel="icon" type="image/png" sizes="16x16" href="https://lh3.googleusercontent.com/Yf2DCX8RKda6r4Jml9DLMByS2zQCBFs3kQpvBfN8UgIh4YVWIYSYIQOoTxJriyuM26cT5PDjyEb5aynDQ0Xyz46yHKnfg8JlUbDW" />
        <link rel="shortcut icon" href="https://lh3.googleusercontent.com/Yf2DCX8RKda6r4Jml9DLMByS2zQCBFs3kQpvBfN8UgIh4YVWIYSYIQOoTxJriyuM26cT5PDjyEb5aynDQ0Xyz46yHKnfg8JlUbDW" />
        <link href="/sre/sre-book/static/css/index.min.css?cache=0ffc48d" rel="stylesheet" />
        <script>
        //<![CDATA[
        (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
        (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
        m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
        })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');

        ga('create', 'UA-75468017-1', 'auto');
        ga('send', 'pageview');
        //]]>
        </script>
        <script src="/sre/sre-book/static/js/detect.min.js?cache=4cb778b"></script>
    </head>
    <body>
        <main>
            <div ng-controller="HeaderCtrl as headerCtrl">
                <div id="curtain" class="menu-closed"></div>
                <div class="header clearfix">
                    <a id="burger-menu" class="expand"></a>
                    <h2 class="chapter-title">
                        Chapter 6 - Monitoring Distributed Systems
                    </h2>
                </div>
                <div id="overlay-element" class="expands">
                    <div class="logo">
                        <a href="https://www.google.com"><img src="https://lh3.googleusercontent.com/YoVRtLOHMSRYQZ3OhFL8RIamcjFYbmQXX4oAQx02MRqqY9zlKNvsuZpS73khXiOqTH3qrFW27VrERJJIHTjPk-tAh46q8-Fd4w6qlw" alt="Google" /></a>
                    </div>
                    <ol id="drop-down" class="dropdown-content hide">
                        <li>
                            <a class="menu-buttons" href="/sre/sre-book/toc/">Table of Contents</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/foreword" class="menu-buttons">Foreword</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/preface" class="menu-buttons">Preface</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/part1" class="menu-buttons">Part I - Introduction</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/introduction" class="menu-buttons">1. Introduction</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/production-environment" class="menu-buttons">2. The Production Environment at Google, from the Viewpoint of an SRE</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/part2" class="menu-buttons">Part II - Principles</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/embracing-risk" class="menu-buttons">3. Embracing Risk</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/service-level-objectives" class="menu-buttons">4. Service Level Objectives</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/eliminating-toil" class="menu-buttons">5. Eliminating Toil</a>
                        </li>
                        <li class="active">
                            <a href="/sre/sre-book/chapters/monitoring-distributed-systems" class="menu-buttons">6. Monitoring Distributed Systems</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/automation-at-google" class="menu-buttons">7. The Evolution of Automation at Google</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/release-engineering" class="menu-buttons">8. Release Engineering</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/simplicity" class="menu-buttons">9. Simplicity</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/part3" class="menu-buttons">Part III - Practices</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/practical-alerting" class="menu-buttons">10. Practical Alerting</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/being-on-call" class="menu-buttons">11. Being On-Call</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/effective-troubleshooting" class="menu-buttons">12. Effective Troubleshooting</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/emergency-response" class="menu-buttons">13. Emergency Response</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/managing-incidents" class="menu-buttons">14. Managing Incidents</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/postmortem-culture" class="menu-buttons">15. Postmortem Culture: Learning from Failure</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/tracking-outages" class="menu-buttons">16. Tracking Outages</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/testing-reliability" class="menu-buttons">17. Testing for Reliability</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/software-engineering-in-sre" class="menu-buttons">18. Software Engineering in SRE</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/load-balancing-frontend" class="menu-buttons">19. Load Balancing at the Frontend</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/load-balancing-datacenter" class="menu-buttons">20. Load Balancing in the Datacenter</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/handling-overload" class="menu-buttons">21. Handling Overload</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/addressing-cascading-failures" class="menu-buttons">22. Addressing Cascading Failures</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/managing-critical-state" class="menu-buttons">23. Managing Critical State: Distributed Consensus for Reliability</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/distributed-periodic-scheduling" class="menu-buttons">24. Distributed Periodic Scheduling with Cron</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/data-processing-pipelines" class="menu-buttons">25. Data Processing Pipelines</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/data-integrity" class="menu-buttons">26. Data Integrity: What You Read Is What You Wrote</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/reliable-product-launches" class="menu-buttons">27. Reliable Product Launches at Scale</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/part4" class="menu-buttons">Part IV - Management</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/accelerating-sre-on-call" class="menu-buttons">28. Accelerating SREs to On-Call and Beyond</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/dealing-with-interrupts" class="menu-buttons">29. Dealing with Interrupts</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/operational-overload" class="menu-buttons">30. Embedding an SRE to Recover from Operational Overload</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/communication-and-collaboration" class="menu-buttons">31. Communication and Collaboration in SRE</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/evolving-sre-engagement-model" class="menu-buttons">32. The Evolving SRE Engagement Model</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/part5" class="menu-buttons">Part V - Conclusions</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/lessons-learned" class="menu-buttons">33. Lessons Learned from Other Industries</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/conclusion" class="menu-buttons">34. Conclusion</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/availability-table" class="menu-buttons">Appendix A. Availability Table</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/service-best-practices" class="menu-buttons">Appendix B. A Collection of Best Practices for Production Services</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/incident-document" class="menu-buttons">Appendix C. Example Incident State Document</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/postmortem" class="menu-buttons">Appendix D. Example Postmortem</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/launch-checklist" class="menu-buttons">Appendix E. Launch Coordination Checklist</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/production-meeting" class="menu-buttons">Appendix F. Example Production Meeting Minutes</a>
                        </li>
                        <li>
                            <a href="/sre/sre-book/chapters/bibliography" class="menu-buttons">Bibliography</a>
                        </li>
                    </ol>
                </div>
            </div>
            <div id="maia-main" role="main">
                <div class="maia-teleport" id="content"></div>
                <div class="content">
                    <section data-type="chapter" id="chapter_monitoring">
                        <h1 class="heading jumptargets">
                            Monitoring Distributed Systems
                        </h1>
                        <p class="byline author">
                            Written by Rob Ewaschuk<br />
                            Edited by Betsy Beyer
                        </p>
                        <p>
                            Google’s SRE teams have some basic principles and best practices for building successful monitoring and alerting systems. This chapter offers guidelines for what issues should interrupt a human via a page, and how to deal with issues that aren’t serious enough to trigger a page.
                        </p>
                        <section data-type="sect1" id="definitions-2ksZhN">
                            <h1 class="heading jumptargets">
                                Definitions
                            </h1>
                            <p>
                                <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="terminology" id="id-DnC1SWFMhD"></a>There’s no uniformly shared vocabulary for discussing all topics related to monitoring. Even within Google, usage of the following terms varies, but the most common interpretations are listed here.
                            </p>
                            <dl>
                                <dt class="subheaders jumptargets" id="monitoring">
                                    Monitoring
                                </dt>
                                <dd>
                                    <p>
                                        Collecting, processing, aggregating, and displaying real-time quantitative data about a system, such as query counts and types, error counts and types, processing times, and server lifetimes.
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="white-box-monitoring">
                                    White-box monitoring
                                </dt>
                                <dd>
                                    <p>
                                        <a data-type="indexterm" data-primary="white-box monitoring" id="id-9nCjSDS4tZILhX"></a>Monitoring based on metrics exposed by the internals of the system, including logs, interfaces like the Java Virtual Machine Profiling Interface, or an HTTP handler that emits internal statistics.
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="black-box-monitoring">
                                    Black-box monitoring
                                </dt>
                                <dd>
                                    <p>
                                        <a data-type="indexterm" data-primary="black-box monitoring" id="id-zdCxSrSgTWIdhb"></a>Testing externally visible behavior as a user would see it.
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="dashboard">
                                    Dashboard
                                </dt>
                                <dd>
                                    <p>
                                        <a data-type="indexterm" data-primary="dashboards" data-secondary="defined" id="id-VMCPS2SribIkh4"></a>An application (usually web-based) that provides a summary view of a service’s core metrics. A dashboard may have filters, selectors, and so on, but is prebuilt to expose the metrics most important to its users. The dashboard might also display team information such as ticket queue length, a list of high-priority bugs, the current on-call engineer for a given area of responsibility, or recent pushes.
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="alert">
                                    Alert
                                </dt>
                                <dd>
                                    <p>
                                        <a data-type="indexterm" data-primary="alerts" data-secondary="defined" id="id-wqC7SvSPUAIVhQ"></a>A notification intended to be read by a human and that is pushed to a system such as a bug or ticket queue, an email alias, or a pager. Respectively, these alerts are classified as <em>tickets</em>, <em>email alerts</em>,<sup><a class="jumptarget" data-type="noteref" id="id-LvQuvtYS7UvI8h4-marker" href="#id-LvQuvtYS7UvI8h4">22</a></sup> and <em>pages</em>.
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="root-cause">
                                    Root cause
                                </dt>
                                <dd>
                                    <p>
                                        <a data-type="indexterm" data-primary="root cause" data-secondary="defined" id="id-PnCpSaSKsgIjho"></a>A defect in a software or human system that, if repaired, instills confidence that this event won’t happen again in the same way. A given incident might have multiple root causes: for example, perhaps it was caused by a combination of insufficient process automation, software that crashed on bogus input, <em>and</em> insufficient testing of the script used to generate the configuration. Each of these factors might stand alone as a root cause, and each should be repaired.
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="node-and-machine">
                                    Node and machine
                                </dt>
                                <dd>
                                    <p>
                                        <a data-type="indexterm" data-primary="machines" data-secondary="defined" id="id-XmC9SkSlfnI1hK"></a>Used interchangeably to indicate a single instance of a running kernel in either a physical server, virtual machine, or container. There might be multiple <em>services</em> worth monitoring on a single machine. The services may either be:
                                    </p>
                                    <ul>
                                        <li>Related to each other: for example, a caching server and a web server
                                        </li>
                                        <li>Unrelated services sharing hardware: for example, a code repository and a master for a configuration system like <a href="https://puppetlabs.com/puppet/puppet-open-source" target="_blank">Puppet</a> or <a href="https://www.chef.io/chef/" target="_blank">Chef</a>
                                        </li>
                                    </ul>
                                </dd>
                                <dt class="subheaders jumptargets" id="push">
                                    Push
                                </dt>
                                <dd>
                                    <p>
                                        Any change to a service’s running software or its configuration.
                                    </p>
                                </dd>
                            </dl>
                        </section>
                        <section data-type="sect1" id="why-monitor-pWsBTZ">
                            <h1 class="heading jumptargets">
                                Why Monitor?
                            </h1>
                            <p>
                                <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="benefits of monitoring" id="id-kVCkSpFnTl"></a>There are many reasons to monitor a system, including:
                            </p>
                            <dl>
                                <dt class="subheaders jumptargets" id="analyzing-long-term-trends">
                                    Analyzing long-term trends
                                </dt>
                                <dd>
                                    <p>
                                        How big is my database and how fast is it growing? How quickly is my daily-active user count growing?
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="comparing-over-time-or-experiment-groups">
                                    Comparing over time or experiment groups
                                </dt>
                                <dd>
                                    <p>
                                        Are queries faster with Acme Bucket of Bytes 2.72 versus Ajax DB 3.14? How much better is my memcache hit rate with an extra node? Is my site slower than it was last week?
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="alerting">
                                    Alerting
                                </dt>
                                <dd>
                                    <p>
                                        Something is broken, and somebody needs to fix it right now! Or, something might break soon, so somebody should look soon.
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="building-dashboards">
                                    Building dashboards
                                </dt>
                                <dd>
                                    <p>
                                        <a data-type="indexterm" data-primary="dashboards" data-secondary="benefits of" id="id-rjCXSOS0iDIGT8"></a>Dashboards should answer basic questions about your service, and normally include some form of the four golden signals (discussed in <a data-type="xref" href="#xref_monitoring_golden-signals">The Four Golden Signals</a>).
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="conducting-ad-hoc-retrospective-analysis-ie-debugging">
                                    Conducting <i class="italic">ad hoc</i> retrospective analysis (i.e., debugging)
                                </dt>
                                <dd>
                                    <p>
                                        Our latency just shot up; what else happened around the same time?
                                    </p>
                                </dd>
                            </dl>
                            <p>
                                System monitoring is also helpful in supplying raw input into business analytics and in facilitating analysis of security breaches. Because this book focuses on the engineering domains in which SRE has particular expertise, we won’t discuss these applications of monitoring here.
                            </p>
                            <p>
                                Monitoring and alerting enables a system to tell us when it’s broken, or perhaps to tell us what’s about to break. When the system isn’t able to automatically fix itself, we want a human to investigate the alert, determine if there’s a real problem at hand, mitigate the problem, and determine the root cause of the problem. Unless you’re performing security auditing on very narrowly scoped components of a system, you should never trigger an alert simply because "something seems a bit weird."
                            </p>
                            <p>
                                Paging a human is a quite expensive use of an employee’s time. If an employee is at work, a page interrupts their workflow. If the employee is at home, a page interrupts their personal time, and perhaps even their sleep. When pages occur too frequently, employees second-guess, skim, or even ignore incoming alerts, sometimes even ignoring a "real" page that’s masked by the noise. Outages can be prolonged because other noise interferes with a rapid diagnosis and fix. Effective alerting systems have good signal and very low noise.
                            </p>
                        </section>
                        <section data-type="sect1" id="setting-reasonable-expectations-for-monitoring-o8svcM">
                            <h1 class="heading jumptargets">
                                Setting Reasonable Expectations for Monitoring
                            </h1>
                            <p>
                                <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="setting expectations for" id="id-4nCqSYFQcE"></a>Monitoring a complex application is a significant engineering endeavor in and of itself. Even with substantial existing infrastructure for instrumentation, collection, display, and alerting in place, a Google SRE team with 10–12 members typically has one or sometimes two members whose primary assignment is to build and maintain monitoring systems for their service. This number has decreased over time as we generalize and centralize common monitoring infrastructure, but every SRE team typically has at least one “monitoring person.” (That being said, while it can be fun to have access to traffic graph dashboards and the like, SRE teams carefully avoid any situation that requires someone to “stare at a screen to watch for problems.”)
                            </p>
                            <p>
                                <a data-type="indexterm" data-primary="post hoc analysis" id="id-JnCDSjIVcG"></a>In general, Google has trended toward simpler and faster monitoring systems, with better tools for <em>post hoc</em> analysis. We avoid "magic" systems that try to learn thresholds or automatically detect causality. Rules that detect unexpected changes in end-user request rates are one counterexample; while these rules are still kept as simple as possible, they give a very quick detection of a very simple, specific, severe anomaly. Other uses of monitoring data such as capacity planning and traffic prediction can tolerate more fragility, and thus, more complexity. Observational experiments conducted over a very long time horizon (months or years) with a low sampling rate (hours or days) can also often tolerate more fragility because occasional missed samples won’t hide a long-running trend.
                            </p>
                            <p>
                                <a data-type="indexterm" data-primary="dependency hierarchies" id="id-9nCjSOtmcj"></a>Google SRE has experienced only limited success with complex dependency hierarchies. We seldom use rules such as, "If I know the database is slow, alert for a slow database; otherwise, alert for the website being generally slow." Dependency-reliant rules usually pertain to very stable parts of our system, such as our system for draining user traffic away from a datacenter. For example, "If a datacenter is drained, then don’t alert me on its latency" is one common datacenter alerting rule. Few teams at Google maintain complex dependency hierarchies because our infrastructure has a steady rate of continuous refactoring.
                            </p>
                            <p>
                                Some of the ideas described in this chapter are still aspirational: there is always room to move more rapidly from symptom to root cause(s), especially in ever-changing systems. So while this chapter sets out some goals for monitoring systems, and some ways to achieve these goals, it’s important that monitoring systems—especially the critical path from the onset of a production problem, through a page to a human, through basic triage and deep debugging—be kept simple and comprehensible by everyone on the team.
                            </p>
                            <p>
                                Similarly, to keep noise low and signal high, the elements of your monitoring system that direct to a pager need to be very simple and robust. Rules that generate alerts for humans should be simple to understand and represent a clear failure.
                            </p>
                        </section>
                        <section data-type="sect1" id="symptoms-versus-causes-g0sEi4">
                            <h1 class="heading jumptargets">
                                Symptoms Versus Causes
                            </h1>
                            <p>
                                <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="symptoms vs. causes" id="id-JnCDSlFmiG"></a>Your monitoring system should address two questions: what’s broken, and why?
                            </p>
                            <p>
                                The "what’s broken" indicates the symptom; the "why" indicates a (possibly intermediate) cause. <a data-type="xref" href="#table_monitoring_symptoms">Table 6-1</a> lists some hypothetical symptoms and corresponding causes.
                            </p>
                            <table id="table_monitoring_symptoms" class="pagebreak-before">
                                <caption class="jumptarget">
                                    <span class="label">Table 6-1.</span> Example symptoms and causes
                                </caption>
                                <thead>
                                    <tr>
                                        <th>
                                            <strong>Symptom</strong>
                                        </th>
                                        <th>
                                            <strong>Cause</strong>
                                        </th>
                                    </tr>
                                </thead>
                                <tbody>
                                    <tr>
                                        <td>
                                            <p>
                                                <strong>I’m serving HTTP 500s or 404s</strong>
                                            </p>
                                        </td>
                                        <td>
                                            <p>
                                                Database servers are refusing connections
                                            </p>
                                        </td>
                                    </tr>
                                    <tr>
                                        <td>
                                            <p>
                                                <strong>My responses are slow</strong>
                                            </p>
                                        </td>
                                        <td>
                                            <p>
                                                CPUs are overloaded by a bogosort, or an Ethernet cable is crimped under a rack, visible as partial packet loss
                                            </p>
                                        </td>
                                    </tr>
                                    <tr>
                                        <td>
                                            <p>
                                                <strong>Users in Antarctica aren’t receiving animated cat GIFs</strong>
                                            </p>
                                        </td>
                                        <td>
                                            <p>
                                                Your Content Distribution Network hates scientists and felines, and thus blacklisted some client IPs
                                            </p>
                                        </td>
                                    </tr>
                                    <tr>
                                        <td>
                                            <p>
                                                <strong>Private content is world-readable</strong>
                                            </p>
                                        </td>
                                        <td>
                                            <p>
                                                A new software push caused ACLs to be forgotten and allowed all requests
                                            </p>
                                        </td>
                                    </tr>
                                </tbody>
                            </table>
                            <p>
                                "What" versus "why" is one of the most important distinctions in writing good monitoring with maximum signal and minimum noise.
                            </p>
                        </section>
                        <section data-type="sect1" id="black-box-versus-white-box-q8sJuw">
                            <h1 class="heading jumptargets">
                                Black-Box Versus White-Box
                            </h1>
                            <p>
                                <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="blackbox vs. whitebox" id="id-9nCjSvFVuj"></a><a data-type="indexterm" data-primary="white-box monitoring" id="id-ZbC1FMFEu7"></a><a data-type="indexterm" data-primary="black-box monitoring" id="id-zdCXIGFvuy"></a>We combine heavy use of white-box monitoring with modest but critical uses of black-box monitoring. The simplest way to think about black-box monitoring versus white-box monitoring is that black-box monitoring is symptom-oriented and represents active—not predicted—problems: "The system isn’t working correctly, right now." White-box monitoring depends on the ability to inspect the innards of the system, such as logs or HTTP endpoints, with instrumentation. White-box monitoring therefore allows detection of imminent problems, failures masked by retries, and so forth.
                            </p>
                            <p>
                                Note that in a multilayered system, one person’s symptom is another person’s cause. For example, suppose that a database’s performance is slow. Slow database reads are a symptom for the database SRE who detects them. However, for the frontend SRE observing a slow website, the same slow database reads are a cause. Therefore, white-box monitoring is sometimes symptom-oriented, and sometimes cause-oriented, depending on just how informative your white-box is.
                            </p>
                            <p>
                                When collecting telemetry for debugging, white-box monitoring is essential. If web servers seem slow on database-heavy requests, you need to know both how fast the web server perceives the database to be, and how fast the database believes itself to be. Otherwise, you can’t distinguish an actually slow database server from a network problem between your web server and your database.
                            </p>
                            <p>
                                For paging, black-box monitoring has the key benefit of forcing discipline to only nag a human when a problem is both already ongoing and contributing to real symptoms. On the other hand, for not-yet-occurring but imminent problems, black-box monitoring is fairly useless.
                            </p>
                        </section>
                        <section data-type="sect1" id="xref_monitoring_golden-signals">
                            <h1 class="heading jumptargets">
                                The Four Golden Signals
                            </h1>
                            <p>
                                <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="four golden signals of" id="id-ZbCxSMFjU7"></a>The four golden signals of monitoring are latency, traffic, errors, and saturation. If you can only measure four metrics of your user-facing system, focus on these four.
                            </p>
                            <dl>
                                <dt class="subheaders jumptargets" id="latency">
                                    Latency
                                </dt>
                                <dd>
                                    <p>
                                        <a data-type="indexterm" data-primary="service latency" data-secondary="monitoring for" id="id-yYCASJS9FKIWUb"></a><a data-type="indexterm" data-primary="latency" data-secondary="monitoring for" id="id-VMCpF2SXFbIwU4"></a><a data-type="indexterm" data-primary="request latency" id="id-rjCeIOSKFDIaU8"></a><a data-type="indexterm" data-primary="user requests" data-secondary="request latency monitoring" id="id-wqCDtvSGFAIMUQ"></a>The time it takes to service a request. It’s important to distinguish between the latency of successful requests and the latency of failed requests. For example, an HTTP 500 error triggered due to loss of connection to a database or other critical backend might be served very quickly; however, as an HTTP 500 error indicates a failed request, factoring 500s into your overall latency might result in misleading calculations. On the other hand, a slow error is even worse than a fast error! Therefore, it’s important to track error latency, as opposed to just filtering out errors.
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="traffic">
                                    Traffic
                                </dt>
                                <dd>
                                    <p>
                                        <a data-type="indexterm" data-primary="user requests" data-secondary="traffic analysis" id="id-rjCXSOSxtDIaU8"></a><a data-type="indexterm" data-primary="traffic analysis" id="id-wqC4FvSBtAIMUQ"></a>A measure of how much demand is being placed on your system, measured in a high-level system-specific metric. For a web service, this measurement is usually HTTP requests per second, perhaps broken out by the nature of the requests (e.g., static versus dynamic content). For an audio streaming system, this measurement might focus on network I/O rate or concurrent sessions. For a key-value storage system, this measurement might be transactions and retrievals per <span class="keep-together">second</span>.
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="errors">
                                    Errors
                                </dt>
                                <dd>
                                    <p>
                                        <a data-type="indexterm" data-primary="error rates" id="id-x1C4SjSlTLIMUJ"></a><a data-type="indexterm" data-primary="user requests" data-secondary="monitoring failures" id="id-PnCxFaS0TgIVUo"></a>The rate of requests that fail, either explicitly (e.g., HTTP 500s), implicitly (for example, an HTTP 200 success response, but coupled with the wrong content), or by policy (for example, "If you committed to one-second response times, any request over one second is an error"). Where protocol response codes are insufficient to express all failure conditions, secondary (internal) protocols may be necessary to track partial failure modes. Monitoring these cases can be drastically different: catching HTTP 500s at your load balancer can do a decent job of catching all completely failed requests, while only end-to-end system tests can detect that you’re serving the wrong content.
                                    </p>
                                </dd>
                                <dt class="subheaders jumptargets" id="saturation">
                                    Saturation
                                </dt>
                                <dd>
                                    <p>
                                        <a data-type="indexterm" data-primary="saturation" id="id-OnCNS2S4iDIYU8"></a>How "full" your service is. A measure of your system fraction, emphasizing the resources that are most constrained (e.g., in a memory-constrained system, show memory; in an I/O-constrained system, show I/O). Note that many systems degrade in performance before they achieve 100% utilization, so having a utilization target is essential.
                                    </p>
                                    <p>
                                        In complex systems, saturation can be supplemented with higher-level load measurement: can your service properly handle double the traffic, handle only 10% more traffic, or handle even less traffic than it currently receives? For very simple services that have no parameters that alter the complexity of the request (e.g., "Give me a nonce" or "I need a globally unique monotonic integer") that rarely change configuration, a static value from a load test might be adequate. As discussed in the previous paragraph, however, most services need to use indirect signals like CPU utilization or network bandwidth that have a known upper bound. Latency increases are often a leading indicator of saturation. Measuring your 99th percentile response time over some small window (e.g., one minute) can give a very early signal of saturation.
                                    </p>
                                    <p>
                                        Finally, saturation is also concerned with predictions of impending saturation, such as "It looks like your database will fill its hard drive in 4 hours."
                                    </p>
                                </dd>
                            </dl>
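                            <p>
                                The saturation entry above mentions watching the 99th percentile of response time over a short window as an early warning. As a minimal, illustrative Python sketch (the window length, data structure, and names are arbitrary choices, not a prescribed implementation), such a rolling percentile could be computed like this:
                            </p>
                            <pre>
import time
from collections import deque

WINDOW_SECONDS = 60
samples = deque()  # (timestamp, latency_ms) pairs for roughly the last minute

def record(latency_ms, now=None):
    """Add one observation and drop anything older than the window."""
    now = time.time() if now is None else now
    samples.append((now, latency_ms))
    while samples and now - samples[0][0] > WINDOW_SECONDS:
        samples.popleft()

def p99():
    """99th-percentile latency of the current window, or None if it is empty."""
    latencies = sorted(latency for _, latency in samples)
    if not latencies:
        return None
    index = min(int(0.99 * len(latencies)), len(latencies) - 1)
    return latencies[index]
</pre>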
                            <p>
                                If you measure all four golden signals and page a human when one signal is problematic (or, in the case of saturation, nearly problematic), your service will be at least decently covered by monitoring.
                            </p>
                        </section>
                        <section data-type="sect1" id="worrying-about-your-tail-or-instrumentation-and-performance-Yms9Ck">
                            <h1 class="heading jumptargets">
                                Worrying About Your Tail (or, Instrumentation and Performance)
                            </h1>
                            <p>
                                <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="instrumentation and performance" id="id-zdCxSGFQCy"></a><a data-type="indexterm" data-primary="performance" data-secondary="monitoring" id="id-yYCyFpFdCr"></a>When building a monitoring system from scratch, it’s tempting to design a system based upon the mean of some quantity: the mean latency, the mean CPU usage of your nodes, or the mean fullness of your databases. The danger presented by the latter two cases is obvious: CPUs and databases can easily be utilized in a very imbalanced way. The same holds for latency. If you run a web service with an average latency of 100 ms at 1,000 requests per second, 1% of requests might easily take 5 seconds.<sup><a class="jumptarget" data-type="noteref" id="id-QQLuAIXFxCz-marker" href="#id-QQLuAIXFxCz">23</a></sup> If your users depend on several such web services to render their page, the 99th percentile of one backend can easily become the median response of your <span class="keep-together">frontend</span>.
                            </p>
                            <p>
                                The simplest way to differentiate between a slow average and a very slow "tail" of requests is to collect request counts bucketed by latencies (suitable for rendering a histogram), rather than actual latencies: how many requests did I serve that took between 0 ms and 10 ms, between 10 ms and 30 ms, between 30 ms and 100 ms, between 100 ms and 300 ms, and so on? Distributing the histogram boundaries approximately exponentially (in this case by factors of roughly 3) is often an easy way to visualize the distribution of your requests.
                            </p>
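                            <p>
                                As a minimal, illustrative Python sketch (the bucket boundaries and function names here are arbitrary, not a prescribed implementation), latency bucketing of this kind amounts to incrementing a counter per bucket rather than storing raw latencies:
                            </p>
                            <pre>
# Count requests into exponentially spaced latency buckets; the upper bounds
# below grow by roughly a factor of 3, matching the example in the text.
import bisect

BUCKET_BOUNDS_MS = [10, 30, 100, 300, 1000, 3000, 10000]
bucket_counts = [0] * (len(BUCKET_BOUNDS_MS) + 1)  # final bucket is the overflow

def record_latency(latency_ms):
    """Increment the counter for the bucket containing this latency."""
    index = bisect.bisect_left(BUCKET_BOUNDS_MS, latency_ms)
    bucket_counts[index] += 1

for sample_ms in [4, 12, 95, 240, 5200]:
    record_latency(sample_ms)
# bucket_counts now holds the histogram: one request in each populated bucket.
</pre>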
                        </section>
                        <section data-type="sect1" id="choosing-an-appropriate-resolution-for-measurements-vJsBsE">
                            <h1 class="heading jumptargets">
                                Choosing an Appropriate Resolution for Measurements
                            </h1>
                            <p>
                                <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="resolution" id="id-yYCASpFxsr"></a>Different aspects of a system should be measured with different levels of granularity. For example:
                            </p>
                            <ul>
                                <li>Observing CPU load over the time span of a minute won’t reveal even quite long-lived spikes that drive high tail latencies.
                                </li>
                                <li>On the other hand, for a web service targeting no more than 9 hours aggregate downtime per year (99.9% annual uptime), probing for a 200 (success) status more than once or twice a minute is probably unnecessarily frequent.
                                </li>
                                <li>Similarly, checking hard drive fullness for a service targeting 99.9% availability more than once every 1–2 minutes is probably unnecessary.
                                </li>
                            </ul>
                            <p>
                                Take care in how you structure the granularity of your measurements. Collecting per-second measurements of CPU load might yield interesting data, but such frequent measurements may be very expensive to collect, store, and analyze. If your monitoring goal calls for high resolution but doesn’t require extremely low latency, you can reduce these costs by performing internal sampling on the server, then configuring an external system to collect and aggregate that distribution over time or across servers. You might:
                            </p>
                            <ol>
                                <li>Record the current CPU utilization each second.
                                </li>
                                <li>Using buckets of 5% granularity, increment the appropriate CPU utilization bucket each second.
                                </li>
                                <li>Aggregate those values every minute.
                                </li>
                            </ol>
                            <p>
                                This strategy allows you to observe brief CPU hotspots without incurring very high cost due to collection and retention.
                            </p>
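                            <p>
                                A minimal Python sketch of this three-step approach follows (the probe function is a stand-in for a real CPU reading, e.g., from /proc/stat, and the returned buckets would be shipped to your external collection system rather than printed):
                            </p>
                            <pre>
import random
import time

BUCKET_WIDTH = 0.05  # 5% granularity: 20 buckets covering 0-100% utilization

def read_cpu_utilization():
    # Placeholder probe returning a utilization fraction between 0.0 and 1.0.
    return random.random()

def collect_one_minute():
    """Sample once per second, bucket at 5% granularity, aggregate per minute."""
    buckets = [0] * 20
    for _ in range(60):
        utilization = read_cpu_utilization()
        index = min(int(utilization / BUCKET_WIDTH), 19)  # clamp 100% into last bucket
        buckets[index] += 1
        time.sleep(1)
    return buckets  # exported once per minute instead of 60 raw samples
</pre>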
                        </section>
                        <section data-type="sect1" id="as-simple-as-possible-no-simpler-lqskHx">
                            <h1 class="heading jumptargets">
                                As Simple as Possible, No Simpler
                            </h1>
                            <p>
                                <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="avoiding complexity in" id="id-VMCPSrFpHm"></a>Piling all these requirements on top of each other can add up to a very complex monitoring system—your system might end up with the following levels of complexity:
                            </p>
                            <ul>
                                <li>Alerts on different latency thresholds, at different percentiles, on all kinds of different metrics
                                </li>
                                <li>Extra code to detect and expose possible causes
                                </li>
                                <li>Associated dashboards for each of these possible causes
                                </li>
                            </ul>
                            <p>
                                The sources of potential complexity are never-ending. Like all software systems, monitoring can become so complex that it’s fragile, complicated to change, and a maintenance burden.
                            </p>
                            <p>
                                Therefore, design your monitoring system with an eye toward simplicity. In choosing what to monitor, keep the following guidelines in mind:
                            </p>
                            <ul>
                                <li>The rules that catch real incidents most often should be as simple, predictable, and reliable as possible.
                                </li>
                                <li>Data collection, aggregation, and alerting configuration that is rarely exercised (e.g., less than once a quarter for some SRE teams) should be up for removal.
                                </li>
                                <li>Signals that are collected, but not exposed in any prebaked dashboard nor used by any alert, are candidates for removal.
                                </li>
                            </ul>
                            <p>
                                In Google’s experience, basic collection and aggregation of metrics, paired with alerting and dashboards, has worked well as a relatively standalone system. (In fact Google’s monitoring system is broken up into several binaries, but typically people learn about all aspects of these binaries.) It can be tempting to combine monitoring with other aspects of inspecting complex systems, such as detailed system profiling, single-process debugging, tracking details about exceptions or crashes, load testing, log collection and analysis, or traffic inspection. While most of these subjects share commonalities with basic monitoring, blending together too many results in overly complex and fragile systems. As in many other aspects of software engineering, maintaining distinct systems with clear, simple, loosely coupled points of integration is a better strategy (for example, using web APIs for pulling summary data in a format that can remain constant over an extended period of time).
                            </p>
                        </section>
                        <section data-type="sect1" id="tying-these-principles-together-nqsJfw">
                            <h1 class="heading jumptargets">
                                Tying These Principles Together
                            </h1>
                            <p>
                                The principles discussed in this chapter can be tied together into a philosophy on monitoring and alerting that’s widely endorsed and followed within Google SRE teams. While this monitoring philosophy is a bit aspirational, it’s a good starting point for writing or reviewing a new alert, and it can help your organization ask the right questions, regardless of the size of your organization or the complexity of your service or system.
                            </p>
                            <p>
                                <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="creating rules for" id="id-wqC7SDIvfj"></a>When creating rules for monitoring and alerting, asking the following questions can help you avoid false positives and pager burnout:<sup><a class="jumptarget" data-type="noteref" id="id-a82udF8IBfx-marker" href="#id-a82udF8IBfx">24</a></sup>
                            </p>
                            <ul>
                                <li>Does this rule detect <em>an otherwise undetected condition</em> that is urgent, actionable, and actively or imminently user-visible?<sup><a class="jumptarget" data-type="noteref" id="id-0vYuEFpSjSMtLfG-marker" href="#id-0vYuEFpSjSMtLfG">25</a></sup>
                                </li>
                                <li>Will I ever be able to ignore this alert, knowing it’s benign? When and why will I be able to ignore this alert, and how can I avoid this scenario?
                                </li>
                                <li>Does this alert definitely indicate that users are being negatively affected? Are there detectable cases in which users aren’t being negatively impacted, such as drained traffic or test deployments, that should be filtered out?
                                </li>
                                <li>Can I take action in response to this alert? Is that action urgent, or could it wait until morning? Could the action be safely automated? Will that action be a long-term fix, or just a short-term workaround?
                                </li>
                                <li>Are other people getting paged for this issue, therefore rendering at least one of the pages unnecessary?
                                </li>
                            </ul>
                            <p>
                                <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="monitoring philosophy" id="id-PnCpSwhJfa"></a>These questions reflect a fundamental philosophy on pages and pagers:
                            </p>
                            <ul>
                                <li>Every time the pager goes off, I should be able to react with a sense of urgency. I can only react with a sense of urgency a few times a day before I become fatigued.
                                </li>
                                <li>Every page should be actionable.
                                </li>
                                <li>Every page response should require intelligence. If a page merely merits a robotic response, it shouldn’t be a page.
                                </li>
                                <li>Pages should be about a novel problem or an event that hasn’t been seen before.
                                </li>
                            </ul>
                            <p>
                                Such a perspective dissipates certain distinctions: if a page satisfies the preceding four bullets, it’s irrelevant whether the page is triggered by white-box or black-box monitoring. This perspective also amplifies certain distinctions: it’s better to spend much more effort on catching symptoms than causes; when it comes to causes, only worry about very definite, very imminent causes.
                            </p>
                        </section>
                        <section data-type="sect1" id="monitoring-for-the-long-term-NbsNS8">
                            <h1 class="heading jumptargets">
                                Monitoring for the Long Term
                            </h1>
                            <p>
                                <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="challenges of" id="id-wqC7SPFMSj"></a>In modern production systems, monitoring systems track an ever-evolving system with changing software architecture, load characteristics, and performance targets. An alert that’s currently exceptionally rare and hard to automate might become frequent, perhaps even meriting a hacked-together script to resolve it. At this point, someone should find and eliminate the root causes of the problem; if such resolution isn’t possible, the alert response deserves to be fully automated.
                            </p>
                            <p>
                                It’s important that decisions about monitoring be made with long-term goals in mind. Every page that happens today distracts a human from improving the system for tomorrow, so there is often a case for taking a short-term hit to availability or performance in order to improve the long-term outlook for the system. Let’s take a look at two case studies that illustrate this trade-off.
                            </p>
                            <section data-type="sect2" id="bigtable-sre-a-tale-of-over-alerting-dbsXtjSM">
                                <h2 class="subheaders jumptargets">
                                    Bigtable SRE: A Tale of Over-Alerting
                                </h2>
                                <p>
                                    <a data-type="indexterm" id="MDSbig6" data-primary="monitoring distributed systems" data-secondary="case studies"></a><a data-type="indexterm" data-primary="Bigtable" id="id-XmCpFOFytySv"></a>Google’s internal infrastructure is typically offered and measured against a service level objective (SLO; see <a data-type="xref" href="/sre/sre-book/chapters/service-level-objectives">Service Level Objectives</a>). Many years ago, the Bigtable service’s SLO was based on a synthetic well-behaved client’s mean performance. Because of problems in Bigtable and lower layers of the storage stack, the mean performance was driven by a "large" tail: the worst 5% of requests were often significantly slower than the rest.
                                </p>
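                                <p>
                                    To see how a heavy tail can drive the mean, consider an illustrative (not Bigtable-specific) latency sample in which 95% of requests are fast and the worst 5% are twenty times slower: the mean nearly doubles, while the 75th percentile barely registers the tail.
                                </p>
                                <pre data-type="programlisting" data-code-language="python">
# Illustrative numbers only: a slow 5% tail dominates the mean while the
# 75th percentile stays close to the typical request.
import statistics

latencies_ms = [100] * 95 + [2000] * 5   # 95 fast requests, 5 very slow ones

mean = statistics.mean(latencies_ms)                 # 195 ms
p75 = statistics.quantiles(latencies_ms, n=100)[74]  # ~100 ms
print("mean =", mean, "ms; 75th percentile =", p75, "ms")
</pre>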
                                <p>
                                    Email alerts were triggered as the SLO approached, and paging alerts were triggered when the SLO was exceeded. Both types of alerts were firing voluminously, consuming unacceptable amounts of engineering time: the team spent significant amounts of time triaging the alerts to find the few that were really actionable, and we often missed the problems that actually affected users, because so few of them did. Many of the pages were non-urgent, due to well-understood problems in the infrastructure, and had either rote responses or received no response.
                                </p>
                                <p>
                                    To remedy the situation, the team used a three-pronged approach: while making great efforts to improve the performance of Bigtable, we also temporarily dialed back our SLO target, using the 75th percentile request latency. We also disabled email alerts, as there were so many that spending time diagnosing them was infeasible.
                                </p>
                                <p>
                                    This strategy gave us enough breathing room to actually fix the longer-term problems in Bigtable and the lower layers of the storage stack, rather than constantly fixing tactical problems. On-call engineers could actually accomplish work when they weren’t being kept up by pages at all hours. Ultimately, temporarily backing off on our alerts allowed us to make faster progress toward a better service.
                                </p>
                            </section>
                            <section data-type="sect2" id="gmail-predictable-scriptable-responses-from-humans-BVs1h4SD">
                                <h2 class="subheaders jumptargets">
                                    Gmail: Predictable, Scriptable Responses from Humans
                                </h2>
                                <p>
                                    <a data-type="indexterm" data-primary="Gmail" id="id-XmC9SOFZhySv"></a>In the very early days of Gmail, the service was built on a retrofitted distributed process management system called Workqueue, which was originally created for batch processing of pieces of the search index. Workqueue was "adapted" to long-lived processes and subsequently applied to Gmail, but certain bugs in the relatively opaque codebase in the scheduler proved hard to beat.
                                </p>
                                <p>
                                    At that time, the Gmail monitoring was structured such that alerts fired when individual tasks were “de-scheduled” by Workqueue. This setup was less than ideal because even at that time, Gmail had many, many thousands of tasks, each task representing a fraction of a percent of our users. We cared deeply about providing a good user experience for Gmail users, but such an alerting setup was unmaintainable.
                                </p>
                                <p>
                                    To address this problem, Gmail SRE built a tool that helped “poke” the scheduler in just the right way to minimize impact to users. The team had several discussions about whether or not we should simply automate the entire loop from detecting the problem to nudging the rescheduler, until a better long-term solution was achieved, but some worried this kind of workaround would delay a real fix.
                                </p>
                                <p>
                                    This kind of tension is common within a team, and often reflects an underlying mistrust of the team’s self-discipline: while some team members want to implement a “hack” to allow time for a proper fix, others worry that a hack will be forgotten or that the proper fix will be deprioritized indefinitely. This concern is credible, as it’s easy to build layers of unmaintainable technical debt by patching over problems instead of making real fixes. Managers and technical leaders play a key role in implementing true, long-term fixes by supporting and prioritizing potentially time-consuming long-term fixes even when the initial “pain” of paging subsides.
                                </p>
                                <p>
                                    Pages with rote, algorithmic responses should be a red flag. Unwillingness on the part of your team to automate such pages implies that the team lacks confidence that they can clean up their technical debt. This is a major problem worth escalating.<a data-type="indexterm" data-primary="" data-startref="MDSbig6" id="id-oPCASqT2hLSk"></a>
                                </p>
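                                <p>
                                    The shape such automation can take is sketched below. This is not Gmail’s actual tooling: the detection and remediation functions are placeholders for whatever rote response your team would otherwise perform by hand, and a human is paged only when the rote response fails.
                                </p>
                                <pre data-type="programlisting" data-code-language="python">
# Sketch: automate a rote response so it no longer pages a human.
# find_descheduled_tasks() and reschedule() are placeholders for your own
# detection and remediation; only failures of the rote fix reach on-call.
import logging
import time

logging.basicConfig(level=logging.INFO)

def find_descheduled_tasks():
    """Placeholder: return tasks the scheduler has dropped."""
    return []

def reschedule(task):
    """Placeholder: nudge the scheduler to pick the task back up."""
    return True

def page_oncall(message):
    """Placeholder: escalate to a human only when automation fails."""
    logging.error("PAGE: %s", message)

while True:
    for task in find_descheduled_tasks():
        if reschedule(task):
            logging.info("auto-remediated %s; no page needed", task)
        else:
            page_oncall("could not reschedule %s" % task)
    time.sleep(60)
</pre>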
                            </section>
                            <section data-type="sect2" id="the-long-run-MQsWTMS7">
                                <h2 class="subheaders jumptargets">
                                    The Long Run
                                </h2>
                                <p>
                                    <a data-type="indexterm" data-primary="monitoring distributed systems" data-secondary="short- vs. long-term availability" id="id-jyCxSoFETNSd"></a>A common theme connects the previous examples of Bigtable and Gmail: a tension between short-term and long-term availability. Often, sheer force of effort can help a rickety system achieve high availability, but this path is usually short-lived and fraught with burnout and dependence on a small number of heroic team members. Taking a controlled, short-term decrease in availability is often a painful, but strategic trade for the long-run stability of the system. It’s important not to think of every page as an event in isolation, but to consider whether the overall <em>level</em> of paging leads toward a healthy, appropriately available system with a healthy, viable team and long-term outlook. We review statistics about page frequency (usually expressed as incidents per shift, where an incident might be composed of a few related pages) in quarterly reports with management, ensuring that decision makers are kept up to date on the pager load and overall health of their teams.
                                </p>
                            </section>
                        </section>
                        <section data-type="sect1" id="conclusion-8ksvFj">
                            <h1 class="heading jumptargets">
                                Conclusion
                            </h1>
                            <p>
                                A healthy monitoring and alerting pipeline is simple and easy to reason about. It focuses primarily on symptoms for paging, reserving cause-oriented heuristics to serve as aids to debugging problems. Monitoring symptoms is easier the further "up" your stack you monitor, though monitoring saturation and performance of subsystems such as databases often must be performed directly on the subsystem itself. Email alerts are of very limited value and tend to easily become overrun with noise; instead, you should favor a dashboard that monitors all ongoing subcritical problems for the sort of information that typically ends up in email alerts. A dashboard might also be paired with a log, in order to analyze historical correlations.
                            </p>
                            <p>
                                Over the long haul, achieving a successful on-call rotation and product includes choosing to alert on symptoms or imminent real problems, adapting your targets to goals that are actually achievable, and making sure that your monitoring supports rapid diagnosis.
                            </p>
                        </section>
                        <div class="footnotes" data-type="footnotes">
                            <p data-type="footnote" id="id-LvQuvtYS7UvI8h4">
                                <sup><a class="jumptargets" href="#id-LvQuvtYS7UvI8h4-marker">22</a></sup>Sometimes known as "alert spam," as they are rarely read or acted on.
                            </p>
                            <p data-type="footnote" id="id-QQLuAIXFxCz">
                                <sup><a class="jumptargets" href="#id-QQLuAIXFxCz-marker">23</a></sup>If 1% of your requests are 50x the average, it means that the rest of your requests are about twice as fast as the average. But if you’re not measuring your distribution, the idea that most of your requests are near the mean is just hopeful thinking.
                            </p>
                            <p data-type="footnote" id="id-a82udF8IBfx">
                                <sup><a class="jumptargets" href="#id-a82udF8IBfx-marker">24</a></sup>See <em>Applying Cardiac Alarm Management Techniques to Your On-Call</em> <a data-type="xref" href="/sre/sre-book/chapters/bibliography#Hol14" target="_blank">[Hol14]</a> for an example of alert fatigue in another context.
                            </p>
                            <p data-type="footnote" id="id-0vYuEFpSjSMtLfG">
                                <sup><a class="jumptargets" href="#id-0vYuEFpSjSMtLfG-marker">25</a></sup>Zero-redundancy (<em>N</em> + 0) situations count as imminent, as do "nearly full" parts of your service! For more details about the concept of redundancy, see <a href="https://en.wikipedia.org/wiki/N%2B1_redundancy" target="_blank"><em class="hyperlink">https://en.wikipedia.org/wiki/N%2B1_redundancy</em></a>.
                            </p>
                        </div>
                    </section>
                </div>
            </div>
            <div class="footer">
                <div class="maia-aux">
                    <div class="previous">
                        <a href="/sre/sre-book/chapters/eliminating-toil">
                        <p class="footer-caption">
                            Previous
                        </p>
                        <p class="chapter-link">
                            Chapter 5 - Eliminating Toil
                        </p></a>
                    </div>
                    <div class="next">
                        <a href="/sre/sre-book/chapters/automation-at-google">
                        <p class="footer-caption">
                            Next
                        </p>
                        <p class="chapter-link">
                            Chapter 7 - The Evolution of Automation at Google
                        </p></a>
                    </div>
                    <p class="footer-link">
                        Copyright © 2017 Google, Inc. Published by O'Reilly Media, Inc. Licensed under <a href="https://creativecommons.org/licenses/by-nc-nd/4.0/" target="_blank">CC BY-NC-ND 4.0</a>
                    </p>
                </div>
            </div>
        </main>
        <script src="//ajax.googleapis.com/ajax/libs/angularjs/1.6.6/angular.min.js"></script> 
        <script src="//ajax.googleapis.com/ajax/libs/angularjs/1.6.6/angular-animate.min.js"></script> 
        <script src="//ajax.googleapis.com/ajax/libs/angularjs/1.6.6/angular-touch.min.js"></script> 
        <script src="/sre/sre-book/static/js/index.min.js?cache=5b7f90b"></script>
    </body>
</html>