Description
Tracer Version(s)
1.52.1
Java Version(s)
amazoncorretto:22-al2023 (docker base image)
JVM Vendor
Amazon Corretto
Bug Report
Note: This bug report was prepared with assistance from Claude AI for analysis and documentation.
A NullPointerException occurs in the ScopeStack.checkOverdueScopes() method due to a race condition on the top field. Despite the null check (if (top == null || top.source() != ITERATION)), the NPE still happens because top is not declared volatile and is read a second time for the top.source() call, allowing another thread to clear the field between the null check and that subsequent read.
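For context, here is a minimal, self-contained sketch of the suspected pattern; the Scope type and ITERATION constant are placeholders, and the real dd-trace-java source may differ:

// Simplified sketch of the suspected pattern, not the actual dd-trace-java code.
final class ScopeStack {
  interface Scope { byte source(); }
  static final byte ITERATION = 1; // placeholder for the real constant

  private Scope top; // shared between threads, not volatile

  boolean checkOverdueScopes() {
    // 'top' is read twice: once for the null check and once for source().
    // Another thread (e.g. one running cleanup()) can set it to null
    // between those two reads, producing the NPE in the stack trace below.
    if (top == null || top.source() != ITERATION) {
      return false;
    }
    // ... close overdue iteration scopes ...
    return true;
  }

  void cleanup() {
    // Elsewhere, the field can be cleared concurrently.
    top = null;
  }
}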
Stack Trace
java.lang.NullPointerException: Cannot invoke "datadog.trace.agent.core.scopemanager.ContinuableScope.source()" because "this.top" is null
at datadog.trace.agent.core.scopemanager.ScopeStack.checkOverdueScopes(ScopeStack.java:90)
at datadog.trace.agent.core.scopemanager.ContinuableScope.close(ContinuableScope.java:50)
at datadog.trace.instrumentation.springdata.RepositoryInterceptor.invoke(RepositoryInterceptor.java:48)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:184)
at org.springframework.aop.framework.JdkDynamicAopProxy.invoke(JdkDynamicAopProxy.java:223)
at jdk.proxy3.$Proxy162.findAllActiveByPermissions
at jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103)
at java.lang.reflect.Method.invoke(Method.java:580)
at kotlin.reflect.jvm.internal.calls.CallerImpl$Method.callMethod(CallerImpl.kt:97)
at kotlin.reflect.jvm.internal.calls.CallerImpl$Method$Instance.call(CallerImpl.kt:113)
at kotlin.reflect.jvm.internal.KCallableImpl.callDefaultMethod$kotlin_reflection(KCallableImpl.kt:207)
at kotlin.reflect.full.KCallables.callSuspendBy(KCallables.kt:74)
at org.springframework.core.CoroutinesUtils.lambda$invokeSuspendingFunction$3(CoroutinesUtils.java:143)
at kotlin.coroutines.intrinsics.IntrinsicsKt__IntrinsicsJvmKt$createCoroutineUnintercepted$$inlined$createCoroutineFromSuspendFunction$IntrinsicsKt__IntrinsicsJvmKt$4.invokeSuspend(IntrinsicsJvm.kt:270)
at kotlin.coroutines.jvm.internal.BaseContinuationImpl.resumeWith(ContinuationImpl.kt:33)
at kotlinx.coroutines.DispatchedTask.run(DispatchedTask.kt:104)
at kotlinx.coroutines.EventLoop.processUnconfinedEvent(EventLoop.common.kt:65)
at kotlinx.coroutines.DispatchedTaskKt.resumeUnconfined(DispatchedTask.kt:241)
at kotlinx.coroutines.DispatchedTaskKt.dispatch(DispatchedTask.kt:159)
at kotlinx.coroutines.CancellableContinuationImpl.dispatchResume(CancellableContinuationImpl.kt:466)
at kotlinx.coroutines.CancellableContinuationImpl.resumeImpl(CancellableContinuationImpl.kt:500)
at kotlinx.coroutines.CancellableContinuationImpl.resumeImpl$default(CancellableContinuationImpl.kt:489)
at kotlinx.coroutines.CancellableContinuationImpl.resumeWith(CancellableContinuationImpl.kt:364)
at kotlinx.coroutines.reactor.MonoKt$awaitSingleOrNull$2$1.onComplete(Mono.kt:52)
at reactor.core.publisher.StrictSubscriber.onComplete(StrictSubscriber.java:123)
at reactor.core.publisher.FluxContextWrite$ContextWriteSubscriber.onComplete(FluxContextWrite.java:126)
at reactor.core.publisher.MonoNext$NextSubscriber.onComplete(MonoNext.java:102)
at reactor.core.publisher.MonoNext$NextSubscriber.onNext(MonoNext.java:83)
at reactor.core.publisher.FluxUsingWhen$UsingWhenSubscriber.onNext(FluxUsingWhen.java:348)
at reactor.core.publisher.FluxMap$MapSubscriber.onNext(FluxMap.java:122)
at reactor.core.publisher.FluxMap$MapSubscriber.onNext(FluxMap.java:122)
at reactor.core.publisher.FluxFilter$FilterSubscriber.onNext(FluxFilter.java:113)
at reactor.core.publisher.MonoNext$NextSubscriber.onNext(MonoNext.java:82)
at reactor.core.publisher.FluxOnErrorResume$ResumeSubscriber.onNext(FluxOnErrorResume.java:79)
at reactor.core.publisher.MonoFlatMapMany$FlatMapManyInner.onNext(MonoFlatMapMany.java:251)
at reactor.core.publisher.FluxDefaultIfEmpty$DefaultIfEmptySubscriber.onNext(FluxDefaultIfEmpty.java:122)
at reactor.core.publisher.FluxMap$MapSubscriber.onNext(FluxMap.java:122)
at reactor.core.publisher.MonoNext$NextSubscriber.onNext(MonoNext.java:82)
at reactor.core.publisher.MonoNext$NextSubscriber.onNext(MonoNext.java:82)
at io.lettuce.core.RedisPublisher$ImmediateSubscriber.onNext(RedisPublisher.java:895)
at io.lettuce.core.RedisPublisher$RedisSubscription.onNext(RedisPublisher.java:295)
at io.lettuce.core.RedisPublisher$SubscriptionCommand.doOnComplete(RedisPublisher.java:782)
at io.lettuce.core.protocol.CommandWrapper.complete(CommandWrapper.java:69)
at io.lettuce.core.protocol.CommandWrapper.complete(CommandWrapper.java:67)
at io.lettuce.core.protocol.CommandHandler.complete(CommandHandler.java:769)
at io.lettuce.core.protocol.CommandHandler.decode(CommandHandler.java:704)
at io.lettuce.core.protocol.CommandHandler.channelRead(CommandHandler.java:621)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:442)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)
at io.netty.handler.ssl.SslHandler.unwrap(SslHandler.java:1519)
at io.netty.handler.ssl.SslHandler.decodeJdkCompatible(SslHandler.java:1377)
at io.netty.handler.ssl.SslHandler.decode(SslHandler.java:1428)
at io.netty.handler.codec.ByteToMessageDecoder.decodeRemovalReentryProtection(ByteToMessageDecoder.java:530)
at io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:469)
at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:290)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412)
at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1357)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:440)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420)
at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:868)
at io.netty.channel.epoll.AbstractEpollStreamChannel$EpollStreamUnsafe.epollInReady(AbstractEpollStreamChannel.java:799)
at io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:501)
at io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:399)
at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998)
at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
at java.lang.Thread.run(Thread.java:1570)
Root Cause
The issue occurs due to a race condition in multi-threaded environments:
- Thread A evaluates the null check and finds that top is not null.
- Thread B concurrently executes the cleanup() method and sets top = null.
- Thread A then calls top.source() on the now-null reference.
- The NullPointerException is thrown.
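The same failure mode can be demonstrated without dd-trace-java at all. The following self-contained sketch (with a hypothetical Holder type standing in for the scope) performs the same check-then-act on a shared, non-volatile reference and will, on most machines, intermittently observe the NPE:

public class CheckThenActRace {
  static final class Holder { int source() { return 1; } }

  static Holder top = new Holder(); // intentionally not volatile

  public static void main(String[] args) throws InterruptedException {
    // Writer thread mimics cleanup(): it keeps clearing and resetting the field.
    Thread writer = new Thread(() -> {
      for (int i = 0; i < 10_000_000; i++) {
        top = new Holder();
        top = null;
      }
    });
    writer.start();

    long npeCount = 0;
    for (int i = 0; i < 10_000_000; i++) {
      try {
        // Same shape as the check in checkOverdueScopes(): the field is read
        // twice, so it can become null between the null check and source().
        if (top == null || top.source() != 1) {
          continue;
        }
      } catch (NullPointerException e) {
        npeCount++;
      }
    }
    writer.join();
    System.out.println("NPEs observed: " + npeCount);
  }
}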
Impact
This race condition can cause unexpected crashes in applications using dd-trace-java in multi-threaded scenarios, particularly when scope management operations occur concurrently.
Expected Behavior
The checkOverdueScopes() method should handle concurrent access safely without throwing a NullPointerException. Specifically, the method should:
- Complete the null check and proceed: when top is not null and is an ITERATION scope, safely access top.source() and continue processing overdue scopes.
- Return false safely: when top is null or is not an ITERATION scope, return false without any exceptions.
- Maintain thread safety: work correctly in multi-threaded environments where multiple threads may concurrently modify the scope stack through operations such as cleanup(), push(), or scope closure.
The scope management should be robust and not crash the application due to race conditions between threads performing legitimate scope operations.
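One way to meet these expectations, sketched here purely as an illustration (using the same placeholder types as the sketch above, not the project's actual source or a proposed patch), is to read top into a local variable once so the null check and the source() call observe the same reference; declaring the field volatile would additionally ensure each read sees the latest write:

boolean checkOverdueScopes() {
  final Scope current = top; // single read of the shared field
  if (current == null || current.source() != ITERATION) {
    return false;
  }
  // From here on, operate only on 'current' and never re-read 'top',
  // so a concurrent cleanup() cannot cause an NPE in this method.
  // ... close overdue iteration scopes ...
  return true;
}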
Reproduction Code
The race condition is difficult to reproduce consistently due to its timing-sensitive nature, but here's a test case that can help trigger the issue:
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
// Also needed (exact packages depend on how the tracer is packaged in your build):
// the @Test annotation (JUnit 4 or 5) and the dd-trace-java internals used below
// (ContinuableScopeManager, ContinuableScope, AgentSpan, GlobalTracer).
public class ScopeStackRaceConditionTest {
@Test
public void testConcurrentScopeOperations() throws InterruptedException {
final int THREAD_COUNT = 100;
final int ITERATIONS = 1000;
final CountDownLatch startLatch = new CountDownLatch(1);
final CountDownLatch doneLatch = new CountDownLatch(THREAD_COUNT);
final AtomicInteger exceptionCount = new AtomicInteger(0);
ContinuableScopeManager scopeManager = new ContinuableScopeManager();
ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT);
// Create multiple threads that concurrently manipulate scopes
for (int i = 0; i < THREAD_COUNT; i++) {
executor.submit(() -> {
try {
startLatch.await();
for (int j = 0; j < ITERATIONS; j++) {
try {
// Create and close scopes rapidly
AgentSpan span = GlobalTracer.get().buildSpan("test-span").start();
ContinuableScope scope = (ContinuableScope) scopeManager.activate(span);
// This can trigger the race condition
scope.close();
} catch (NullPointerException e) {
if (e.getMessage() != null &&
e.getMessage().contains("Cannot invoke") &&
e.getMessage().contains("source()") &&
e.getMessage().contains("this.top")) {
exceptionCount.incrementAndGet();
}
}
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
} finally {
doneLatch.countDown();
}
});
}
// Start all threads simultaneously
startLatch.countDown();
doneLatch.await();
executor.shutdown();
// The NPE should be reproduced if the race condition occurs
System.out.println("NPE occurrences: " + exceptionCount.get());
}
}
A second example of concurrent scope operations that can trigger the race condition:
public void triggerRaceCondition() {
ContinuableScopeManager scopeManager = new ContinuableScopeManager();
// Thread 1: Continuously create and close scopes
Thread scopeThread = new Thread(() -> {
for (int i = 0; i < 10000; i++) {
AgentSpan span = GlobalTracer.get().buildSpan("iteration-" + i).start();
ContinuableScope scope = (ContinuableScope) scopeManager.activate(span);
scope.close(); // This calls checkOverdueScopes()
}
});
// Thread 2: Continuously trigger cleanup
Thread cleanupThread = new Thread(() -> {
for (int i = 0; i < 10000; i++) {
// Access scope stack to trigger potential cleanup
scopeManager.activeScope();
}
});
scopeThread.start();
cleanupThread.start();
// Wait for threads to complete
try {
scopeThread.join();
cleanupThread.join();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}