Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions hadoop-hdds/interface-client/src/main/proto/hdds.proto
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ message Node {
repeated NodeOperationalState nodeOperationalStates = 3;
optional int32 totalVolumeCount = 4;
optional int32 healthyVolumeCount = 5;
repeated string failedVolumes = 6;
}

message NodePool {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,14 @@ public void getMetrics(MetricsCollector collector, boolean all) {
Integer.parseInt(nonWritableNodes));
}

String volumeFailures = nodeStatistics.get("VolumeFailures");
if (volumeFailures != null) {
metrics.addGauge(
Interns.info("VolumeFailures",
"Number of datanodes with at least one failed volume"),
Integer.parseInt(volumeFailures));
}

for (Map.Entry<String, Long> e : nodeInfo.entrySet()) {
metrics.addGauge(
Interns.info(e.getKey(), diskMetricDescription(e.getKey())),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionSummary;
import org.apache.hadoop.hdds.protocol.proto.ReconfigureProtocolProtos.ReconfigureProtocolService;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.StorageReportProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
Expand Down Expand Up @@ -655,6 +656,7 @@ public List<HddsProtos.Node> queryNode(
if (datanodeInfo != null) {
nodeBuilder.setTotalVolumeCount(datanodeInfo.getStorageReports().size());
nodeBuilder.setHealthyVolumeCount(datanodeInfo.getHealthyVolumeCount());
addFailedVolumes(nodeBuilder, datanodeInfo);
}
result.add(nodeBuilder.build());
}
Expand Down Expand Up @@ -687,6 +689,7 @@ public HddsProtos.Node queryNode(UUID uuid)
if (datanodeInfo != null) {
nodeBuilder.setTotalVolumeCount(datanodeInfo.getStorageReports().size());
nodeBuilder.setHealthyVolumeCount(datanodeInfo.getHealthyVolumeCount());
addFailedVolumes(nodeBuilder, datanodeInfo);
}
result = nodeBuilder.build();
}
Expand All @@ -702,6 +705,15 @@ public HddsProtos.Node queryNode(UUID uuid)
return result;
}

/**
 * Copies the storage location of every failed volume reported by the
 * datanode into the node builder's repeated {@code failedVolumes} field.
 * A volume counts as failed only when the report explicitly carries the
 * failed flag and that flag is set.
 */
private static void addFailedVolumes(HddsProtos.Node.Builder nodeBuilder,
    DatanodeInfo datanodeInfo) {
  for (StorageReportProto storageReport : datanodeInfo.getStorageReports()) {
    // Skip reports that either lack the flag or report a healthy volume.
    if (!storageReport.hasFailed() || !storageReport.getFailed()) {
      continue;
    }
    nodeBuilder.addFailedVolumes(storageReport.getStorageLocation());
  }
}

@Override
public List<DatanodeAdminError> decommissionNodes(List<String> nodes, boolean force)
throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
Expand All @@ -41,6 +43,8 @@ public final class BasicDatanodeInfo {
private Integer totalVolumeCount = null;
@JsonInclude(JsonInclude.Include.NON_NULL)
private Integer healthyVolumeCount = null;
@JsonInclude(JsonInclude.Include.NON_EMPTY)
private List<String> failedVolumes = null;

private BasicDatanodeInfo(Builder builder) {
this.dn = builder.dn;
Expand All @@ -51,26 +55,30 @@ private BasicDatanodeInfo(Builder builder) {
this.percentUsed = builder.percentUsed;
this.totalVolumeCount = builder.totalVolumeCount;
this.healthyVolumeCount = builder.healthyVolumeCount;
this.failedVolumes = builder.failedVolumes;
}

/**
* Builder class for creating instances of BasicDatanodeInfo.
*/
public static class Builder {
private DatanodeDetails dn;
private HddsProtos.NodeOperationalState opState;
private HddsProtos.NodeState healthState;
private final DatanodeDetails dn;
private final HddsProtos.NodeOperationalState opState;
private final HddsProtos.NodeState healthState;
private Long used;
private Long capacity;
private Double percentUsed;
private Integer totalVolumeCount;
private Integer healthyVolumeCount;

public Builder(DatanodeDetails dn, HddsProtos.NodeOperationalState opState,
HddsProtos.NodeState healthState) {
this.dn = dn;
this.opState = opState;
this.healthState = healthState;
private final Integer totalVolumeCount;
private final Integer healthyVolumeCount;
private final List<String> failedVolumes;

/**
 * Creates a builder pre-populated from the protobuf {@code Node} message.
 * Volume counts are left {@code null} when the message does not carry
 * them, so JSON output can omit the fields entirely.
 *
 * NOTE(review): assumes the message contains at least one entry in both
 * nodeStates and nodeOperationalStates — index 0 is read unconditionally;
 * confirm callers never pass a node without states.
 */
public Builder(HddsProtos.Node node) {
dn = DatanodeDetails.getFromProtoBuf(node.getNodeID());
healthState = node.getNodeStates(0);
opState = node.getNodeOperationalStates(0);
// Optional proto fields: distinguish "absent" (null) from a real count.
totalVolumeCount = node.hasTotalVolumeCount() ? node.getTotalVolumeCount() : null;
healthyVolumeCount = node.hasHealthyVolumeCount() ? node.getHealthyVolumeCount() : null;
failedVolumes = getFailedVolumes(node);
}

public Builder withUsageInfo(long usedBytes, long capacityBytes, double percentUsedBytes) {
Expand All @@ -80,12 +88,6 @@ public Builder withUsageInfo(long usedBytes, long capacityBytes, double percentU
return this;
}

public Builder withVolumeCounts(Integer total, Integer healthy) {
this.totalVolumeCount = total;
this.healthyVolumeCount = healthy;
return this;
}

/** Builds an immutable {@link BasicDatanodeInfo} from this builder's state. */
public BasicDatanodeInfo build() {
return new BasicDatanodeInfo(this);
}
Expand Down Expand Up @@ -206,8 +208,26 @@ public Integer getHealthyVolumeCount() {
return healthyVolumeCount;
}

/**
 * Returns the storage locations of this datanode's failed volumes.
 * Serialized in JSON only when non-empty (see the NON_EMPTY include rule
 * on the backing field).
 */
@JsonProperty(index = 112)
public List<String> getFailedVolumes() {
return failedVolumes;
}

/**
 * Returns the underlying {@link DatanodeDetails}; excluded from JSON
 * output because the relevant fields are exposed individually.
 */
@JsonIgnore
public DatanodeDetails getDatanodeDetails() {
return dn;
}

/**
 * Extracts the failed-volume paths from the protobuf {@code Node}
 * message into a plain list. Yields an immutable empty list when the
 * message reports no failed volumes.
 */
private static List<String> getFailedVolumes(HddsProtos.Node node) {
  final int total = node.getFailedVolumesCount();
  if (total == 0) {
    return Collections.emptyList();
  }
  List<String> volumes = new ArrayList<>(total);
  int idx = 0;
  while (idx < total) {
    volumes.add(node.getFailedVolumes(idx));
    idx++;
  }
  return volumes;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ public class ListInfoSubcommand extends ScmSubcommand {
defaultValue = "false")
private boolean json;

@CommandLine.Option(names = {"--nodes-with-failed-volumes"},
description = "Only show datanodes that have at least one failed volume.",
defaultValue = "false")
private boolean nodeWithFailedVolumes;

Comment on lines +68 to +72
Copy link
Copy Markdown
Contributor

@sreejasahithi sreejasahithi Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The --nodes-with-failed-volumes filter is silently ignored when --node-id is used.
We should make these options mutually exclusive: if a user provides both, the command currently returns the node with the specified ID regardless of whether it has any failed volumes, which is confusing.

@CommandLine.ArgGroup(exclusive = true, multiplicity = "0..1")
private ExclusiveNodeOptions exclusiveNodeOptions;

Expand All @@ -88,11 +93,7 @@ public void execute(ScmClient scmClient) throws IOException {
pipelines = scmClient.listPipelines();
if (exclusiveNodeOptions != null && !Strings.isNullOrEmpty(exclusiveNodeOptions.getNodeId())) {
HddsProtos.Node node = scmClient.queryNode(UUID.fromString(exclusiveNodeOptions.getNodeId()));
Integer totalVolumeCount = node.hasTotalVolumeCount() ? node.getTotalVolumeCount() : null;
Integer healthyVolumeCount = node.hasHealthyVolumeCount() ? node.getHealthyVolumeCount() : null;
BasicDatanodeInfo singleNodeInfo = new BasicDatanodeInfo.Builder(
DatanodeDetails.getFromProtoBuf(node.getNodeID()), node.getNodeOperationalStates(0),
node.getNodeStates(0)).withVolumeCounts(totalVolumeCount, healthyVolumeCount).build();
BasicDatanodeInfo singleNodeInfo = new BasicDatanodeInfo.Builder(node).build();
if (json) {
List<BasicDatanodeInfo> dtoList = Collections.singletonList(singleNodeInfo);
System.out.println(JsonUtils.toJsonStringWithDefaultPrettyPrinter(dtoList));
Expand All @@ -118,6 +119,10 @@ public void execute(ScmClient scmClient) throws IOException {
allNodes = allNodes.filter(p -> p.getHealthState().toString()
.compareToIgnoreCase(nodeState) == 0);
}
if (nodeWithFailedVolumes) {
allNodes = allNodes.filter(p ->
p.getFailedVolumes() != null && !p.getFailedVolumes().isEmpty());
}

if (!listLimitOptions.isAll()) {
allNodes = allNodes.limit(listLimitOptions.getLimit());
Expand Down Expand Up @@ -154,13 +159,9 @@ private List<BasicDatanodeInfo> getAllNodes(ScmClient scmClient)
long capacity = p.getCapacity();
long used = capacity - p.getRemaining();
double percentUsed = (capacity > 0) ? (used * 100.0) / capacity : 0.0;
Integer totalVolumeCount = node.hasTotalVolumeCount() ? node.getTotalVolumeCount() : null;
Integer healthyVolumeCount = node.hasHealthyVolumeCount() ? node.getHealthyVolumeCount() : null;
return new BasicDatanodeInfo.Builder(
DatanodeDetails.getFromProtoBuf(node.getNodeID()),
node.getNodeOperationalStates(0), node.getNodeStates(0))
return new BasicDatanodeInfo.Builder(node)
.withUsageInfo(used, capacity, percentUsed)
.withVolumeCounts(totalVolumeCount, healthyVolumeCount).build();
.build();
} catch (Exception e) {
String reason = "Could not process info for an unknown datanode";
if (p != null && p.getNode() != null && !Strings.isNullOrEmpty(p.getNode().getUuid())) {
Expand All @@ -177,12 +178,7 @@ private List<BasicDatanodeInfo> getAllNodes(ScmClient scmClient)
List<HddsProtos.Node> nodes = scmClient.queryNode(null,
null, HddsProtos.QueryScope.CLUSTER, "");

return nodes.stream().map(p -> {
Integer totalVolumeCount = p.hasTotalVolumeCount() ? p.getTotalVolumeCount() : null;
Integer healthyVolumeCount = p.hasHealthyVolumeCount() ? p.getHealthyVolumeCount() : null;
return new BasicDatanodeInfo.Builder(
DatanodeDetails.getFromProtoBuf(p.getNodeID()), p.getNodeOperationalStates(0), p.getNodeStates(0))
.withVolumeCounts(totalVolumeCount, healthyVolumeCount).build(); })
return nodes.stream().map(p -> new BasicDatanodeInfo.Builder(p).build())
.sorted(Comparator.comparing(BasicDatanodeInfo::getHealthState))
.collect(Collectors.toList());
}
Expand All @@ -206,10 +202,12 @@ private void printDatanodeInfo(BasicDatanodeInfo dn) {
.append('/').append(p.getPipelineState().toString()).append('/')
.append(datanode.getID().equals(p.getLeaderId()) ?
"Leader" : "Follower")
.append(System.getProperty("line.separator")));
.append('\n'));
}
} else {
pipelineListInfo.append("No pipelines in cluster.");
pipelineListInfo
.append("No pipelines in cluster.")
.append('\n');
}
System.out.println("Datanode: " + datanode.getUuid().toString() +
" (" + datanode.getNetworkLocation() + "/" + datanode.getIpAddress()
Expand All @@ -221,6 +219,12 @@ private void printDatanodeInfo(BasicDatanodeInfo dn) {
System.out.println("Total volume count: " + dn.getTotalVolumeCount() + "\n" +
"Healthy volume count: " + dn.getHealthyVolumeCount());
}
if (dn.getFailedVolumes() != null && !dn.getFailedVolumes().isEmpty()) {
System.out.println("Failed volumes:");
for (String vol : dn.getFailedVolumes()) {
System.out.println(" " + vol);
}
}
System.out.println("Related pipelines:\n" + pipelineListInfo);

if (dn.getUsed() != null && dn.getCapacity() != null && dn.getUsed() >= 0 && dn.getCapacity() > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.hadoop.hdds.scm.cli.datanode;

import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
Expand Down Expand Up @@ -357,6 +358,50 @@ public void testVolumeCounters() throws Exception {
assertTrue(output.contains("Healthy volume count:"), "Should display healthy volume count");
}

@Test
public void testFailedVolumesFilter() throws Exception {
ScmClient scmClient = mock(ScmClient.class);
List<HddsProtos.Node> baseNodes = getNodeDetails();

// Build a 4-node cluster where exactly two nodes carry failed volumes,
// so the --nodes-with-failed-volumes filter has both matches and misses.
List<HddsProtos.Node> nodes = new ArrayList<>();
// node 0: 1 failed volume
nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(0))
.setTotalVolumeCount(4).setHealthyVolumeCount(3)
.addFailedVolumes("/data/disk2").build());
// node 1: healthy, no failed volumes
nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(1))
.setTotalVolumeCount(4).setHealthyVolumeCount(4).build());
// node 2: 2 failed volumes
nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(2))
.setTotalVolumeCount(6).setHealthyVolumeCount(4)
.addFailedVolumes("/data/disk1")
.addFailedVolumes("/data/disk5").build());
// node 3: healthy, no failed volumes
nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(3))
.setTotalVolumeCount(4).setHealthyVolumeCount(4).build());

// Stub the cluster query; pipelines are irrelevant here, so return none.
when(scmClient.queryNode(any(), any(), any(), any())).thenReturn(nodes);
when(scmClient.listPipelines()).thenReturn(new ArrayList<>());

CommandLine c = new CommandLine(cmd);
c.parseArgs("--nodes-with-failed-volumes");
cmd.execute(scmClient);
String output = outContent.toString(DEFAULT_ENCODING);

// Only 2 datanodes (those with failed volumes) should appear
// Each listed datanode prints exactly one line starting with "Datanode:".
Matcher m = Pattern.compile("^Datanode:", Pattern.MULTILINE)
.matcher(output);
int count = 0;
while (m.find()) {
count++;
}
assertEquals(2, count, "Only datanodes with failed volumes should be listed");
// The failed-volume paths of the surviving nodes must all be printed.
assertThat(output).contains("Failed volume");
assertThat(output).contains("/data/disk2");
assertThat(output).contains("/data/disk1");
assertThat(output).contains("/data/disk5");
}

private void validateOrdering(JsonNode root, String orderDirection) {
for (int i = 0; i < root.size() - 1; i++) {
long usedCurrent = root.get(i).get("used").asLong();
Expand Down
Loading