diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java index 09480009455d..13ea52924383 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import org.apache.hadoop.hdds.conf.ConfigurationSource; @@ -39,6 +40,7 @@ import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport; import org.apache.hadoop.hdds.server.events.EventQueue; import org.apache.hadoop.hdds.server.events.TypedEvent; +import org.apache.hadoop.util.Time; /** * Abstract class for Container Safe mode exit rule. 
@@ -136,7 +138,15 @@ public double getCurrentContainerThreshold() {
   @Override
   public synchronized void refresh(boolean forceRefresh) {
     if (forceRefresh || !validate()) {
-      initializeRule();
+      // Time the re-initialization; the finally block publishes it even if initializeRule() throws.
+      final long refreshBeganNanos = Time.monotonicNowNanos();
+      try {
+        initializeRule();
+      } finally {
+        final long elapsedNanos = Time.monotonicNowNanos() - refreshBeganNanos;
+        final long elapsedMs = TimeUnit.NANOSECONDS.toMillis(elapsedNanos);
+        getSafeModeMetrics().setLastContainerSafeModeRuleRefreshDurationMs(getContainerType(), elapsedMs);
+      }
     }
   }
 
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
index 2c9173b2bf09..65e52ec42723 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
@@ -40,6 +40,7 @@
 import org.apache.hadoop.hdds.scm.node.NodeManager;
 import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
 import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.apache.hadoop.util.Time;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -89,6 +90,9 @@ public class SCMSafeModeManager implements SafeModeManager {
   private ScheduledExecutorService safeModeLogExecutor;
   private ScheduledFuture safeModeLogTask;
 
+  /** Monotonic time when SCM entered safe mode; used to report exit duration.
*/
+  private volatile long safeModeEnteredAtNanos = -1L;
+
   public SCMSafeModeManager(final ConfigurationSource conf,
       final NodeManager nodeManager,
       final PipelineManager pipelineManager,
@@ -120,6 +124,9 @@ public SCMSafeModeManager(final ConfigurationSource conf,
   }
 
   public void start() {
+    if (getInSafeMode()) {
+      safeModeEnteredAtNanos = Time.monotonicNowNanos();
+    }
     emitSafeModeStatus();
     startSafeModePeriodicLogger();
   }
@@ -177,13 +184,18 @@ public synchronized void validateSafeModeExitRules(String ruleName) {
       LOG.info("ScmSafeModeManager, all rules are successfully validated");
       LOG.info("SCM exiting safe mode.");
       emitSafeModeStatus();
+      recordSafeModeExitDuration();
     }
   }
 
   public void forceExitSafeMode() {
+    boolean wasInSafeMode = getInSafeMode();
     LOG.info("SCM force-exiting safe mode.");
     status.set(SafeModeStatus.OUT_OF_SAFE_MODE);
     emitSafeModeStatus();
+    if (wasInSafeMode) {
+      recordSafeModeExitDuration();
+    }
   }
 
   /**
@@ -308,6 +320,24 @@ private synchronized void logSafeModeStatus() {
     }
   }
 
+  /**
+   * Publishes the time elapsed since {@link #start()} captured the safe mode entry timestamp,
+   * then clears that timestamp so the metric is set at most once per captured entry.
+   * Does nothing when no timestamp is pending.
+   *
+   * <p>Synchronized because it is reached from both {@link #validateSafeModeExitRules(String)}
+   * and the unsynchronized {@link #forceExitSafeMode()}; the check-then-reset must not interleave.
+   */
+  private synchronized void recordSafeModeExitDuration() {
+    if (safeModeEnteredAtNanos < 0) {
+      return;
+    }
+    long durationMs =
+        TimeUnit.NANOSECONDS.toMillis(Time.monotonicNowNanos() - safeModeEnteredAtNanos);
+    safeModeEnteredAtNanos = -1;
+    safeModeMetrics.setScmSafeModeExitDurationMs(durationMs);
+    LOG.info("SCM safe mode exit duration {} ms (since start() while in safe mode)", durationMs);
+  }
+
   /**
    * Stops the periodic safe mode logger.
    * Called when safe mode exits.
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
index ae65eafcb910..39ed3fc77047 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
@@ -59,6 +59,13 @@ public class SafeModeMetrics {
   @Metric private MutableGaugeLong numRequiredDatanodesThreshold;
   @Metric private MutableCounterLong currentRegisteredDatanodesCount;
 
+  @Metric("Wall-clock time (ms) SCM spent in safe mode for the last exit")
+  private MutableGaugeLong scmSafeModeExitDurationMs;
+  @Metric("Duration (ms) of the last Ratis container safe mode rule incremental refresh")
+  private MutableGaugeLong lastRatisContainerSafeModeRuleRefreshDurationMs;
+  @Metric("Duration (ms) of the last EC container safe mode rule incremental refresh")
+  private MutableGaugeLong lastEcContainerSafeModeRuleRefreshDurationMs;
+
   public static SafeModeMetrics create() {
     final MetricsSystem ms = DefaultMetricsSystem.instance();
     return ms.register(SOURCE_NAME, "SCM Safemode Metrics", new SafeModeMetrics());
@@ -113,6 +120,36 @@ public void incCurrentRegisteredDatanodesCount() {
     this.currentRegisteredDatanodesCount.incr();
   }
 
+  /** Stores the duration, in milliseconds, reported for the most recent safe mode exit. */
+  public void setScmSafeModeExitDurationMs(long durationMs) {
+    scmSafeModeExitDurationMs.set(durationMs);
+  }
+
+  /**
+   * Stores the duration of the latest container safe mode rule refresh in the gauge
+   * that belongs to the given replication type.
+   *
+   * @param type replication type of the reporting rule; only RATIS and EC have a gauge,
+   *             every other value is ignored
+   * @param durationMs refresh duration in milliseconds
+   */
+  public void setLastContainerSafeModeRuleRefreshDurationMs(
+      HddsProtos.ReplicationType type, long durationMs) {
+    final MutableGaugeLong target;
+    switch (type) {
+    case RATIS:
+      target = lastRatisContainerSafeModeRuleRefreshDurationMs;
+      break;
+    case EC:
+      target = lastEcContainerSafeModeRuleRefreshDurationMs;
+      break;
+    default:
+      // No gauge exists for other replication types.
+      return;
+    }
+    target.set(durationMs);
+  }
+
   MutableGaugeLong getNumHealthyPipelinesThreshold() {
     return numHealthyPipelinesThreshold;
   }
diff --git 
a/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json b/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json index ac0c291b83a6..5cbc09a2fec8 100644 --- a/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json +++ b/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json @@ -747,6 +747,209 @@ ], "title": "Registered DataNodes: Target vs Actual", "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 }, + "id": 200, + "panels": [], + "title": "SCM Safemode: Durations", + "type": "row" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "min": 0, + "decimals": 0, + "unit": "ms", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Duration", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 }, + "id": 201, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "disableTextWrap": false, + "editorMode": "code", + "expr": 
"safe_mode_metrics_scm_safe_mode_exit_duration_ms", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{hostname}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Last safe mode exit duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "min": 0, + "decimals": 0, + "unit": "ms", + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Duration", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 }, + "id": 202, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "disableTextWrap": false, + "editorMode": "code", + "expr": "safe_mode_metrics_last_ratis_container_safe_mode_rule_refresh_duration_ms", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{hostname}} Ratis", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "disableTextWrap": false, + "editorMode": "code", + "expr": "safe_mode_metrics_last_ec_container_safe_mode_rule_refresh_duration_ms", + "fullMetaSearch": 
false, + "includeNullMetadata": true, + "legendFormat": "{{hostname}} EC", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Last container rule refresh duration", + "type": "timeseries" } ], "preload": false,