Skip to content

Commit 3a1b4d2

Browse files
authored
[#1598] fix(server) Fix inaccurate used_direct_memory_size metric (#1599)
### What changes were proposed in this pull request? Fix the inaccurate `used_direct_memory_size` metric. The `used_direct_memory_size_by_netty` and `used_direct_memory_size_by_grpc_netty` metrics are also added to provide more detailed indicators for locating and analyzing problems in production. ### Why are the changes needed? Fix #1598. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Tested in our environment.
1 parent 220d9cb commit 3a1b4d2

File tree

2 files changed

+21
-5
lines changed

2 files changed

+21
-5
lines changed

server/src/main/java/org/apache/uniffle/server/NettyDirectMemoryTracker.java

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,20 @@ public void start() {
5353
service.scheduleAtFixedRate(
5454
() -> {
5555
try {
56-
long usedDirectMemory = PlatformDependent.usedDirectMemory();
56+
long usedDirectMemoryByNetty = PlatformDependent.usedDirectMemory();
57+
long usedDirectMemoryByGrpcNetty =
58+
io.grpc.netty.shaded.io.netty.util.internal.PlatformDependent.usedDirectMemory();
5759
if (LOG.isDebugEnabled()) {
58-
LOG.debug("Current usedDirectMemory:{}", usedDirectMemory);
60+
LOG.debug(
61+
"Current usedDirectMemoryByNetty:{}, usedDirectMemoryByGrpcNetty:{}",
62+
usedDirectMemoryByNetty,
63+
usedDirectMemoryByGrpcNetty);
5964
}
60-
ShuffleServerMetrics.gaugeUsedDirectMemorySize.set(usedDirectMemory);
65+
ShuffleServerMetrics.gaugeUsedDirectMemorySizeByNetty.set(usedDirectMemoryByNetty);
66+
ShuffleServerMetrics.gaugeUsedDirectMemorySizeByGrpcNetty.set(
67+
usedDirectMemoryByGrpcNetty);
68+
ShuffleServerMetrics.gaugeUsedDirectMemorySize.set(
69+
usedDirectMemoryByNetty + usedDirectMemoryByGrpcNetty);
6170
} catch (Throwable t) {
6271
LOG.error("Failed to report direct memory.", t);
6372
}

server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,9 @@ public class ShuffleServerMetrics {
7979
private static final String USED_BUFFER_SIZE = "used_buffer_size";
8080
private static final String READ_USED_BUFFER_SIZE = "read_used_buffer_size";
8181
private static final String USED_DIRECT_MEMORY_SIZE = "used_direct_memory_size";
82-
private static final String ALLOCATED_DIRECT_MEMORY_SIZE = "allocated_direct_memory_size";
83-
private static final String PINNED_DIRECT_MEMORY_SIZE = "pinned_direct_memory_size";
82+
private static final String USED_DIRECT_MEMORY_SIZE_BY_NETTY = "used_direct_memory_size_by_netty";
83+
private static final String USED_DIRECT_MEMORY_SIZE_BY_GRPC_NETTY =
84+
"used_direct_memory_size_by_grpc_netty";
8485
private static final String TOTAL_FAILED_WRITTEN_EVENT_NUM = "total_failed_written_event_num";
8586
private static final String TOTAL_DROPPED_EVENT_NUM = "total_dropped_event_num";
8687
private static final String TOTAL_HADOOP_WRITE_DATA = "total_hadoop_write_data";
@@ -186,6 +187,8 @@ public class ShuffleServerMetrics {
186187
public static Gauge.Child gaugeUsedBufferSize;
187188
public static Gauge.Child gaugeReadBufferUsedSize;
188189
public static Gauge.Child gaugeUsedDirectMemorySize;
190+
public static Gauge.Child gaugeUsedDirectMemorySizeByNetty;
191+
public static Gauge.Child gaugeUsedDirectMemorySizeByGrpcNetty;
189192
public static Gauge.Child gaugeWriteHandler;
190193
public static Gauge.Child gaugeEventQueueSize;
191194
public static Gauge.Child gaugeHadoopFlushThreadPoolQueueSize;
@@ -382,6 +385,10 @@ private static void setUpMetrics() {
382385
gaugeUsedBufferSize = metricsManager.addLabeledGauge(USED_BUFFER_SIZE);
383386
gaugeReadBufferUsedSize = metricsManager.addLabeledGauge(READ_USED_BUFFER_SIZE);
384387
gaugeUsedDirectMemorySize = metricsManager.addLabeledGauge(USED_DIRECT_MEMORY_SIZE);
388+
gaugeUsedDirectMemorySizeByNetty =
389+
metricsManager.addLabeledGauge(USED_DIRECT_MEMORY_SIZE_BY_NETTY);
390+
gaugeUsedDirectMemorySizeByGrpcNetty =
391+
metricsManager.addLabeledGauge(USED_DIRECT_MEMORY_SIZE_BY_GRPC_NETTY);
385392
gaugeWriteHandler = metricsManager.addLabeledGauge(TOTAL_WRITE_HANDLER);
386393
gaugeEventQueueSize = metricsManager.addLabeledGauge(EVENT_QUEUE_SIZE);
387394
gaugeHadoopFlushThreadPoolQueueSize =

0 commit comments

Comments
 (0)