-
Notifications
You must be signed in to change notification settings - Fork 596
HDDS-14990. Show failed volumes in ozone admin datanode list output and SCM metrics
#10058
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
f6c1b57
4ab01e6
c4d22d3
b48fb63
008d8ee
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ | |
|
|
||
| import com.google.common.base.Strings; | ||
| import java.io.IOException; | ||
| import java.util.ArrayList; | ||
| import java.util.Collections; | ||
| import java.util.Comparator; | ||
| import java.util.List; | ||
|
|
@@ -65,6 +66,11 @@ public class ListInfoSubcommand extends ScmSubcommand { | |
| defaultValue = "false") | ||
| private boolean json; | ||
|
|
||
| @CommandLine.Option(names = {"--failed-volumes"}, | ||
| description = "Only show datanodes that have at least one failed volume.", | ||
| defaultValue = "false") | ||
| private boolean failedVolumesOnly; | ||
|
|
||
|
Comment on lines
+68
to
+72
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The `--nodes-with-failed-volumes` filter is silently ignored when `--node-id` is used. |
||
| @CommandLine.ArgGroup(exclusive = true, multiplicity = "0..1") | ||
| private ExclusiveNodeOptions exclusiveNodeOptions; | ||
|
|
||
|
|
@@ -92,7 +98,8 @@ public void execute(ScmClient scmClient) throws IOException { | |
| Integer healthyVolumeCount = node.hasHealthyVolumeCount() ? node.getHealthyVolumeCount() : null; | ||
| BasicDatanodeInfo singleNodeInfo = new BasicDatanodeInfo.Builder( | ||
| DatanodeDetails.getFromProtoBuf(node.getNodeID()), node.getNodeOperationalStates(0), | ||
| node.getNodeStates(0)).withVolumeCounts(totalVolumeCount, healthyVolumeCount).build(); | ||
| node.getNodeStates(0)).withVolumeCounts(totalVolumeCount, healthyVolumeCount) | ||
| .withFailedVolumes(getFailedVolumes(node)).build(); | ||
|
||
| if (json) { | ||
| List<BasicDatanodeInfo> dtoList = Collections.singletonList(singleNodeInfo); | ||
| System.out.println(JsonUtils.toJsonStringWithDefaultPrettyPrinter(dtoList)); | ||
|
|
@@ -118,6 +125,10 @@ public void execute(ScmClient scmClient) throws IOException { | |
| allNodes = allNodes.filter(p -> p.getHealthState().toString() | ||
| .compareToIgnoreCase(nodeState) == 0); | ||
| } | ||
| if (failedVolumesOnly) { | ||
| allNodes = allNodes.filter(p -> | ||
| p.getFailedVolumes() != null && !p.getFailedVolumes().isEmpty()); | ||
| } | ||
|
|
||
| if (!listLimitOptions.isAll()) { | ||
| allNodes = allNodes.limit(listLimitOptions.getLimit()); | ||
|
|
@@ -160,7 +171,8 @@ private List<BasicDatanodeInfo> getAllNodes(ScmClient scmClient) | |
| DatanodeDetails.getFromProtoBuf(node.getNodeID()), | ||
| node.getNodeOperationalStates(0), node.getNodeStates(0)) | ||
| .withUsageInfo(used, capacity, percentUsed) | ||
| .withVolumeCounts(totalVolumeCount, healthyVolumeCount).build(); | ||
| .withVolumeCounts(totalVolumeCount, healthyVolumeCount) | ||
| .withFailedVolumes(getFailedVolumes(node)).build(); | ||
| } catch (Exception e) { | ||
| String reason = "Could not process info for an unknown datanode"; | ||
| if (p != null && p.getNode() != null && !Strings.isNullOrEmpty(p.getNode().getUuid())) { | ||
|
|
@@ -182,11 +194,24 @@ private List<BasicDatanodeInfo> getAllNodes(ScmClient scmClient) | |
| Integer healthyVolumeCount = p.hasHealthyVolumeCount() ? p.getHealthyVolumeCount() : null; | ||
| return new BasicDatanodeInfo.Builder( | ||
| DatanodeDetails.getFromProtoBuf(p.getNodeID()), p.getNodeOperationalStates(0), p.getNodeStates(0)) | ||
| .withVolumeCounts(totalVolumeCount, healthyVolumeCount).build(); }) | ||
| .withVolumeCounts(totalVolumeCount, healthyVolumeCount) | ||
| .withFailedVolumes(getFailedVolumes(p)).build(); }) | ||
| .sorted(Comparator.comparing(BasicDatanodeInfo::getHealthState)) | ||
| .collect(Collectors.toList()); | ||
| } | ||
|
|
||
| private static List<String> getFailedVolumes(HddsProtos.Node node) { | ||
| int count = node.getFailedVolumesCount(); | ||
| if (count == 0) { | ||
| return Collections.emptyList(); | ||
| } | ||
| List<String> result = new ArrayList<>(count); | ||
| for (int i = 0; i < count; i++) { | ||
| result.add(node.getFailedVolumes(i)); | ||
| } | ||
| return result; | ||
| } | ||
|
|
||
| private void printDatanodeInfo(BasicDatanodeInfo dn) { | ||
| StringBuilder pipelineListInfo = new StringBuilder(); | ||
| DatanodeDetails datanode = dn.getDatanodeDetails(); | ||
|
|
@@ -210,6 +235,7 @@ private void printDatanodeInfo(BasicDatanodeInfo dn) { | |
| } | ||
| } else { | ||
| pipelineListInfo.append("No pipelines in cluster."); | ||
| pipelineListInfo.append(System.getProperty("line.separator")); | ||
|
||
| } | ||
| System.out.println("Datanode: " + datanode.getUuid().toString() + | ||
| " (" + datanode.getNetworkLocation() + "/" + datanode.getIpAddress() | ||
|
|
@@ -221,6 +247,12 @@ private void printDatanodeInfo(BasicDatanodeInfo dn) { | |
| System.out.println("Total volume count: " + dn.getTotalVolumeCount() + "\n" + | ||
| "Healthy volume count: " + dn.getHealthyVolumeCount()); | ||
| } | ||
| if (dn.getFailedVolumes() != null && !dn.getFailedVolumes().isEmpty()) { | ||
| System.out.println("Failed volumes:"); | ||
| for (String vol : dn.getFailedVolumes()) { | ||
| System.out.println(" " + vol); | ||
| } | ||
| } | ||
| System.out.println("Related pipelines:\n" + pipelineListInfo); | ||
|
|
||
| if (dn.getUsed() != null && dn.getCapacity() != null && dn.getUsed() >= 0 && dn.getCapacity() > 0) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -357,6 +357,50 @@ public void testVolumeCounters() throws Exception { | |
| assertTrue(output.contains("Healthy volume count:"), "Should display healthy volume count"); | ||
| } | ||
|
|
||
| @Test | ||
| public void testFailedVolumesFilter() throws Exception { | ||
| ScmClient scmClient = mock(ScmClient.class); | ||
| List<HddsProtos.Node> baseNodes = getNodeDetails(); | ||
|
|
||
| List<HddsProtos.Node> nodes = new ArrayList<>(); | ||
| // node 0: 1 failed volume | ||
| nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(0)) | ||
| .setTotalVolumeCount(4).setHealthyVolumeCount(3) | ||
| .addFailedVolumes("/data/disk2").build()); | ||
| // node 1: healthy, no failed volumes | ||
| nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(1)) | ||
| .setTotalVolumeCount(4).setHealthyVolumeCount(4).build()); | ||
| // node 2: 2 failed volumes | ||
| nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(2)) | ||
| .setTotalVolumeCount(6).setHealthyVolumeCount(4) | ||
| .addFailedVolumes("/data/disk1") | ||
| .addFailedVolumes("/data/disk5").build()); | ||
| // node 3: healthy, no failed volumes | ||
| nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(3)) | ||
| .setTotalVolumeCount(4).setHealthyVolumeCount(4).build()); | ||
|
|
||
| when(scmClient.queryNode(any(), any(), any(), any())).thenReturn(nodes); | ||
| when(scmClient.listPipelines()).thenReturn(new ArrayList<>()); | ||
|
|
||
| CommandLine c = new CommandLine(cmd); | ||
| c.parseArgs("--failed-volumes"); | ||
| cmd.execute(scmClient); | ||
| String output = outContent.toString(DEFAULT_ENCODING); | ||
|
|
||
| // Only 2 datanodes (those with failed volumes) should appear | ||
| Matcher m = Pattern.compile("^Datanode:", Pattern.MULTILINE) | ||
| .matcher(output); | ||
| int count = 0; | ||
| while (m.find()) { | ||
| count++; | ||
| } | ||
| assertEquals(2, count, "Only datanodes with failed volumes should be listed"); | ||
| assertTrue(output.contains("Failed volume")); | ||
| assertTrue(output.contains("/data/disk2")); | ||
| assertTrue(output.contains("/data/disk1")); | ||
| assertTrue(output.contains("/data/disk5")); | ||
|
||
| } | ||
|
|
||
| private void validateOrdering(JsonNode root, String orderDirection) { | ||
| for (int i = 0; i < root.size() - 1; i++) { | ||
| long usedCurrent = root.get(i).get("used").asLong(); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would `--with-failed-volume` be better?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about `--show-failed-volumes`? `--with-failed-volume` seems to mean "display along with bad disk information".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`--show-failed-volumes` implies failed volumes are normally hidden, and included in the output only if this flag is used. I think the command reads better.
If we want to be extra clear, it can be `--nodes-with-failed-volume`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to `--nodes-with-failed-volume`.