-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Open
Description
As seen at b/453986718, with grpc-java 1.75.0.
Caused by: java.lang.NullPointerException: Cannot invoke "io.grpc.xds.client.LoadStatsManager2.getClusterDropStats(String, String)" because "loadStatsManager" is null
at io.grpc.xds.client.XdsClientImpl.addClusterDropStats(XdsClientImpl.java:398)
at io.grpc.xds.ClusterImplLoadBalancer.acceptResolvedAddresses(ClusterImplLoadBalancer.java:143)
at io.grpc.util.GracefulSwitchLoadBalancer.acceptResolvedAddresses(GracefulSwitchLoadBalancer.java:101)
at io.grpc.xds.PriorityLoadBalancer$ChildLbState.updateResolvedAddresses(PriorityLoadBalancer.java:296)
at io.grpc.xds.PriorityLoadBalancer.tryNextPriority(PriorityLoadBalancer.java:162)
at io.grpc.xds.PriorityLoadBalancer.access$300(PriorityLoadBalancer.java:53)
at io.grpc.xds.PriorityLoadBalancer$ChildLbState$ChildHelper.updateBalancingState(PriorityLoadBalancer.java:346)
at io.grpc.util.GracefulSwitchLoadBalancer$1PendingHelper.updateBalancingState(GracefulSwitchLoadBalancer.java:145)
at io.grpc.xds.ClusterImplLoadBalancer$ClusterImplLbHelper.updateBalancingState(ClusterImplLoadBalancer.java:223)
at io.grpc.util.GracefulSwitchLoadBalancer$1PendingHelper.updateBalancingState(GracefulSwitchLoadBalancer.java:145)
at io.grpc.util.GracefulSwitchLoadBalancer$1PendingHelper.updateBalancingState(GracefulSwitchLoadBalancer.java:145)
at io.grpc.xds.WeightedTargetLoadBalancer.updateOverallBalancingState(WeightedTargetLoadBalancer.java:173)
at io.grpc.xds.WeightedTargetLoadBalancer.access$300(WeightedTargetLoadBalancer.java:45)
at io.grpc.xds.WeightedTargetLoadBalancer$ChildHelper.updateBalancingState(WeightedTargetLoadBalancer.java:213)
at io.grpc.util.GracefulSwitchLoadBalancer$1PendingHelper.updateBalancingState(GracefulSwitchLoadBalancer.java:145)
at io.grpc.util.ForwardingLoadBalancerHelper.updateBalancingState(ForwardingLoadBalancerHelper.java:90)
at io.grpc.util.ForwardingLoadBalancerHelper.updateBalancingState(ForwardingLoadBalancerHelper.java:90)
at io.grpc.util.RoundRobinLoadBalancer.updateBalancingState(RoundRobinLoadBalancer.java:82)
at io.grpc.util.RoundRobinLoadBalancer.updateOverallBalancingState(RoundRobinLoadBalancer.java:73)
at io.grpc.util.MultiChildLoadBalancer$ChildLbState$ChildLbStateHelper.updateBalancingState(MultiChildLoadBalancer.java:349)
at io.grpc.util.RoundRobinLoadBalancer$1$1.updateBalancingState(RoundRobinLoadBalancer.java:106)
at io.grpc.internal.PickFirstLoadBalancer.updateBalancingState(PickFirstLoadBalancer.java:159)
at io.grpc.internal.PickFirstLoadBalancer.processSubchannelState(PickFirstLoadBalancer.java:154)
at io.grpc.internal.PickFirstLoadBalancer.access$000(PickFirstLoadBalancer.java:43)
at io.grpc.internal.PickFirstLoadBalancer$1.onSubchannelState(PickFirstLoadBalancer.java:83)
at io.grpc.protobuf.services.HealthCheckingLoadBalancerFactory$HealthCheckState.gotoState(HealthCheckingLoadBalancerFactory.java:358)
at io.grpc.protobuf.services.HealthCheckingLoadBalancerFactory$HealthCheckState.adjustHealthCheck(HealthCheckingLoadBalancerFactory.java:321)
at io.grpc.protobuf.services.HealthCheckingLoadBalancerFactory$HealthCheckState.onSubchannelState(HealthCheckingLoadBalancerFactory.java:301)
at io.grpc.xds.ClusterImplLoadBalancer$ClusterImplLbHelper$1$1.onSubchannelState(ClusterImplLoadBalancer.java:267)
at io.grpc.internal.ManagedChannelImpl$SubchannelImpl$1ManagedInternalSubchannelCallback.onStateChange(ManagedChannelImpl.java:1869)
at io.grpc.internal.InternalSubchannel.gotoState(InternalSubchannel.java:355)
at io.grpc.internal.InternalSubchannel.scheduleBackoff(InternalSubchannel.java:296)
at io.grpc.internal.InternalSubchannel.access$2700(InternalSubchannel.java:68)
at io.grpc.internal.InternalSubchannel$TransportListener$2.run(InternalSubchannel.java:631)
at io.grpc.SynchronizationContext.drain(SynchronizationContext.java:96)
at io.grpc.SynchronizationContext.execute(SynchronizationContext.java:128)
at io.grpc.internal.InternalSubchannel$TransportListener.transportShutdown(InternalSubchannel.java:611)
at io.grpc.netty.shaded.io.grpc.netty.ClientTransportLifecycleManager.notifyGracefulShutdown(ClientTransportLifecycleManager.java:63)
at io.grpc.netty.shaded.io.grpc.netty.ClientTransportLifecycleManager.notifyShutdown(ClientTransportLifecycleManager.java:69)
at io.grpc.netty.shaded.io.grpc.netty.NettyClientHandler.onConnectionError(NettyClientHandler.java:563)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.Http2ConnectionHandler.onError(Http2ConnectionHandler.java:652)
at io.grpc.netty.shaded.io.grpc.netty.AbstractNettyHandler.exceptionCaught(AbstractNettyHandler.java:105)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:346)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:325)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:317)
at io.grpc.netty.shaded.io.netty.channel.DefaultChannelPipeline$HeadContext.exceptionCaught(DefaultChannelPipeline.java:1324)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:346)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:325)
at io.grpc.netty.shaded.io.netty.channel.DefaultChannelPipeline.fireExceptionCaught(DefaultChannelPipeline.java:856)
at io.grpc.netty.shaded.io.netty.channel.epoll.AbstractEpollStreamChannel$EpollStreamUnsafe.handleReadException(AbstractEpollStreamChannel.java:727)
at io.grpc.netty.shaded.io.netty.channel.epoll.AbstractEpollStreamChannel$EpollStreamUnsafe.epollInReady(AbstractEpollStreamChannel.java:825)
at io.grpc.netty.shaded.io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:501)
at io.grpc.netty.shaded.io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:399)
at io.grpc.netty.shaded.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998)
at io.grpc.netty.shaded.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at io.grpc.netty.shaded.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
... 1 more
Looking at addClusterLocalityStats(), I'm thinking this was introduced while resolving a federation problem at 6d75fca.
This was triggered on creation of the ClusterImplLoadBalancer, but that had been delayed by being a lower priority. loadStatsManagerMap is added to, but never removed from (seems it leaks). So this isn't an ordering problem. loadStatsManagerMap doesn't appear to be properly synchronized.
I see only two ways this could have happened:
- the ServerInfo in ClusterImplLB was gotten from somewhere random and didn't relate to this XdsClientImpl instance
- a new ControlPlaneClient was being created at the same time, causing the hash map to be modified and the map to mistakenly return null. I could see the map returning null if the hash array was resized (which would have required this to be the 4th, or maybe 3rd, entry added) or if there was a hash collision with a new entry (so `next` was not yet updated in HashMap.Node).
Overall, seems this code needs a deep lookover to avoid leaks, races, and lack of synchronization.