Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions hadoop-hdds/interface-client/src/main/proto/hdds.proto
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ message Node {
repeated NodeOperationalState nodeOperationalStates = 3;
optional int32 totalVolumeCount = 4;
optional int32 healthyVolumeCount = 5;
repeated string failedVolumes = 6;
}

message NodePool {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,14 @@ public void getMetrics(MetricsCollector collector, boolean all) {
Integer.parseInt(nonWritableNodes));
}

String volumeFailures = nodeStatistics.get("VolumeFailures");
if (volumeFailures != null) {
metrics.addGauge(
Interns.info("VolumeFailures",
"Number of datanodes with at least one failed volume"),
Integer.parseInt(volumeFailures));
}

for (Map.Entry<String, Long> e : nodeInfo.entrySet()) {
metrics.addGauge(
Interns.info(e.getKey(), diskMetricDescription(e.getKey())),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionSummary;
import org.apache.hadoop.hdds.protocol.proto.ReconfigureProtocolProtos.ReconfigureProtocolService;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.StorageReportProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
Expand Down Expand Up @@ -655,6 +656,7 @@ public List<HddsProtos.Node> queryNode(
if (datanodeInfo != null) {
nodeBuilder.setTotalVolumeCount(datanodeInfo.getStorageReports().size());
nodeBuilder.setHealthyVolumeCount(datanodeInfo.getHealthyVolumeCount());
addFailedVolumes(nodeBuilder, datanodeInfo);
}
result.add(nodeBuilder.build());
}
Expand Down Expand Up @@ -687,6 +689,7 @@ public HddsProtos.Node queryNode(UUID uuid)
if (datanodeInfo != null) {
nodeBuilder.setTotalVolumeCount(datanodeInfo.getStorageReports().size());
nodeBuilder.setHealthyVolumeCount(datanodeInfo.getHealthyVolumeCount());
addFailedVolumes(nodeBuilder, datanodeInfo);
}
result = nodeBuilder.build();
}
Expand All @@ -702,6 +705,15 @@ public HddsProtos.Node queryNode(UUID uuid)
return result;
}

/**
 * Copies the storage location of every volume the datanode reported as
 * failed into the {@code Node} proto builder.
 *
 * @param nodeBuilder  builder for the Node message being assembled
 * @param datanodeInfo source of the per-volume storage reports
 */
private static void addFailedVolumes(HddsProtos.Node.Builder nodeBuilder,
    DatanodeInfo datanodeInfo) {
  datanodeInfo.getStorageReports().stream()
      .filter(report -> report.hasFailed() && report.getFailed())
      .map(StorageReportProto::getStorageLocation)
      .forEach(nodeBuilder::addFailedVolumes);
}

@Override
public List<DatanodeAdminError> decommissionNodes(List<String> nodes, boolean force)
throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ public final class BasicDatanodeInfo {
private Integer totalVolumeCount = null;
@JsonInclude(JsonInclude.Include.NON_NULL)
private Integer healthyVolumeCount = null;
@JsonInclude(JsonInclude.Include.NON_EMPTY)
private List<String> failedVolumes = null;

private BasicDatanodeInfo(Builder builder) {
this.dn = builder.dn;
Expand All @@ -51,6 +53,7 @@ private BasicDatanodeInfo(Builder builder) {
this.percentUsed = builder.percentUsed;
this.totalVolumeCount = builder.totalVolumeCount;
this.healthyVolumeCount = builder.healthyVolumeCount;
this.failedVolumes = builder.failedVolumes;
}

/**
Expand All @@ -65,6 +68,7 @@ public static class Builder {
private Double percentUsed;
private Integer totalVolumeCount;
private Integer healthyVolumeCount;
private List<String> failedVolumes;

public Builder(DatanodeDetails dn, HddsProtos.NodeOperationalState opState,
HddsProtos.NodeState healthState) {
Expand All @@ -86,6 +90,11 @@ public Builder withVolumeCounts(Integer total, Integer healthy) {
return this;
}

/**
 * Sets the storage locations of this datanode's failed volumes.
 * NOTE(review): the list is stored by reference, not defensively copied —
 * callers should not mutate it after passing; confirm this is intended.
 *
 * @param volumes failed volume paths; may be null or empty
 * @return this builder for chaining
 */
public Builder withFailedVolumes(List<String> volumes) {
this.failedVolumes = volumes;
return this;
}

public BasicDatanodeInfo build() {
return new BasicDatanodeInfo(this);
}
Expand Down Expand Up @@ -206,6 +215,11 @@ public Integer getHealthyVolumeCount() {
return healthyVolumeCount;
}

/**
 * @return storage locations of failed volumes, or null when none were set
 *         (the field is serialized only when non-empty, see NON_EMPTY above)
 */
@JsonProperty(index = 112)
public List<String> getFailedVolumes() {
return failedVolumes;
}

@JsonIgnore
public DatanodeDetails getDatanodeDetails() {
return dn;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import com.google.common.base.Strings;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
Expand Down Expand Up @@ -65,6 +66,11 @@ public class ListInfoSubcommand extends ScmSubcommand {
defaultValue = "false")
private boolean json;

@CommandLine.Option(names = {"--failed-volumes"},
description = "Only show datanodes that have at least one failed volume.",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would --with-failed-volume be better?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how about --show-failed-volumes ? --with-failed-volume seems to mean "display along with bad disk information".

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

--show-failed-volumes implies failed volumes are normally hidden, and included in the output only if this flag is used.

I think the command

ozone admin datanode list --with-failed-volume

reads better.

If we want to be extra clear, it can be --nodes-with-failed-volume.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Update to --nodes-with-failed-volume

defaultValue = "false")
private boolean failedVolumesOnly;

Comment on lines +68 to +72
Copy link
Copy Markdown
Contributor

@sreejasahithi sreejasahithi Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The --nodes-with-failed-volumes filter is silently ignored when --node-id is used.
We should make these options mutually exclusive to avoid confusion when a user provides both in the command but receives the result for the node whose ID was specified, regardless of whether it has failed volumes.

@CommandLine.ArgGroup(exclusive = true, multiplicity = "0..1")
private ExclusiveNodeOptions exclusiveNodeOptions;

Expand Down Expand Up @@ -92,7 +98,8 @@ public void execute(ScmClient scmClient) throws IOException {
Integer healthyVolumeCount = node.hasHealthyVolumeCount() ? node.getHealthyVolumeCount() : null;
BasicDatanodeInfo singleNodeInfo = new BasicDatanodeInfo.Builder(
DatanodeDetails.getFromProtoBuf(node.getNodeID()), node.getNodeOperationalStates(0),
node.getNodeStates(0)).withVolumeCounts(totalVolumeCount, healthyVolumeCount).build();
node.getNodeStates(0)).withVolumeCounts(totalVolumeCount, healthyVolumeCount)
.withFailedVolumes(getFailedVolumes(node)).build();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to suggest a small refactoring to reduce duplication of code related to BasicDatanodeInfo creation.

HDDS-14990.patch

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

if (json) {
List<BasicDatanodeInfo> dtoList = Collections.singletonList(singleNodeInfo);
System.out.println(JsonUtils.toJsonStringWithDefaultPrettyPrinter(dtoList));
Expand All @@ -118,6 +125,10 @@ public void execute(ScmClient scmClient) throws IOException {
allNodes = allNodes.filter(p -> p.getHealthState().toString()
.compareToIgnoreCase(nodeState) == 0);
}
if (failedVolumesOnly) {
allNodes = allNodes.filter(p ->
p.getFailedVolumes() != null && !p.getFailedVolumes().isEmpty());
}

if (!listLimitOptions.isAll()) {
allNodes = allNodes.limit(listLimitOptions.getLimit());
Expand Down Expand Up @@ -160,7 +171,8 @@ private List<BasicDatanodeInfo> getAllNodes(ScmClient scmClient)
DatanodeDetails.getFromProtoBuf(node.getNodeID()),
node.getNodeOperationalStates(0), node.getNodeStates(0))
.withUsageInfo(used, capacity, percentUsed)
.withVolumeCounts(totalVolumeCount, healthyVolumeCount).build();
.withVolumeCounts(totalVolumeCount, healthyVolumeCount)
.withFailedVolumes(getFailedVolumes(node)).build();
} catch (Exception e) {
String reason = "Could not process info for an unknown datanode";
if (p != null && p.getNode() != null && !Strings.isNullOrEmpty(p.getNode().getUuid())) {
Expand All @@ -182,11 +194,24 @@ private List<BasicDatanodeInfo> getAllNodes(ScmClient scmClient)
Integer healthyVolumeCount = p.hasHealthyVolumeCount() ? p.getHealthyVolumeCount() : null;
return new BasicDatanodeInfo.Builder(
DatanodeDetails.getFromProtoBuf(p.getNodeID()), p.getNodeOperationalStates(0), p.getNodeStates(0))
.withVolumeCounts(totalVolumeCount, healthyVolumeCount).build(); })
.withVolumeCounts(totalVolumeCount, healthyVolumeCount)
.withFailedVolumes(getFailedVolumes(p)).build(); })
.sorted(Comparator.comparing(BasicDatanodeInfo::getHealthState))
.collect(Collectors.toList());
}

/**
 * Extracts the failed-volume storage locations from a queried Node message.
 *
 * @param node the Node proto returned by SCM
 * @return failed volume paths, or an immutable empty list when none reported
 */
private static List<String> getFailedVolumes(HddsProtos.Node node) {
  final int failedCount = node.getFailedVolumesCount();
  if (failedCount == 0) {
    return Collections.emptyList();
  }
  List<String> failed = new ArrayList<>(failedCount);
  for (int idx = 0; idx < failedCount; idx++) {
    failed.add(node.getFailedVolumes(idx));
  }
  return failed;
}

private void printDatanodeInfo(BasicDatanodeInfo dn) {
StringBuilder pipelineListInfo = new StringBuilder();
DatanodeDetails datanode = dn.getDatanodeDetails();
Expand All @@ -210,6 +235,7 @@ private void printDatanodeInfo(BasicDatanodeInfo dn) {
}
} else {
pipelineListInfo.append("No pipelines in cluster.");
pipelineListInfo.append(System.getProperty("line.separator"));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this needed? pipelineListInfo is output with println below.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is to maintain consistency with the pipelineListInfo in the previous branches, where pipelineListInfos always have line breaks.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the info. Then please change to:

pipelineListInfo.append("No pipelines in cluster.")
    .append(System.getProperty("line.separator"));

due to recently added PMD rule (HDDS-14934).

(BTW, I think using \n would be fine for now, it is already used in few places instead of System.getProperty("line.separator"), both in this file and in 122 others.)

}
System.out.println("Datanode: " + datanode.getUuid().toString() +
" (" + datanode.getNetworkLocation() + "/" + datanode.getIpAddress()
Expand All @@ -221,6 +247,12 @@ private void printDatanodeInfo(BasicDatanodeInfo dn) {
System.out.println("Total volume count: " + dn.getTotalVolumeCount() + "\n" +
"Healthy volume count: " + dn.getHealthyVolumeCount());
}
if (dn.getFailedVolumes() != null && !dn.getFailedVolumes().isEmpty()) {
System.out.println("Failed volumes:");
for (String vol : dn.getFailedVolumes()) {
System.out.println(" " + vol);
}
}
System.out.println("Related pipelines:\n" + pipelineListInfo);

if (dn.getUsed() != null && dn.getCapacity() != null && dn.getUsed() >= 0 && dn.getCapacity() > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,50 @@ public void testVolumeCounters() throws Exception {
assertTrue(output.contains("Healthy volume count:"), "Should display healthy volume count");
}

@Test
public void testFailedVolumesFilter() throws Exception {
ScmClient scmClient = mock(ScmClient.class);
List<HddsProtos.Node> baseNodes = getNodeDetails();

List<HddsProtos.Node> nodes = new ArrayList<>();
// node 0: 1 failed volume
nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(0))
.setTotalVolumeCount(4).setHealthyVolumeCount(3)
.addFailedVolumes("/data/disk2").build());
// node 1: healthy, no failed volumes
nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(1))
.setTotalVolumeCount(4).setHealthyVolumeCount(4).build());
// node 2: 2 failed volumes
nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(2))
.setTotalVolumeCount(6).setHealthyVolumeCount(4)
.addFailedVolumes("/data/disk1")
.addFailedVolumes("/data/disk5").build());
// node 3: healthy, no failed volumes
nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(3))
.setTotalVolumeCount(4).setHealthyVolumeCount(4).build());

when(scmClient.queryNode(any(), any(), any(), any())).thenReturn(nodes);
when(scmClient.listPipelines()).thenReturn(new ArrayList<>());

CommandLine c = new CommandLine(cmd);
c.parseArgs("--failed-volumes");
cmd.execute(scmClient);
String output = outContent.toString(DEFAULT_ENCODING);

// Only 2 datanodes (those with failed volumes) should appear
Matcher m = Pattern.compile("^Datanode:", Pattern.MULTILINE)
.matcher(output);
int count = 0;
while (m.find()) {
count++;
}
assertEquals(2, count, "Only datanodes with failed volumes should be listed");
assertTrue(output.contains("Failed volume"));
assertTrue(output.contains("/data/disk2"));
assertTrue(output.contains("/data/disk1"));
assertTrue(output.contains("/data/disk5"));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: please use assertThat(output).contains(...) for better failure message (see HDDS-9951)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated.

}

private void validateOrdering(JsonNode root, String orderDirection) {
for (int i = 0; i < root.size() - 1; i++) {
long usedCurrent = root.get(i).get("used").asLong();
Expand Down
Loading