Skip to content

[BUG] rdb_bgsave_in_progress since a long time, aof file growing non-stop #675

@zas

Description

@zas

Describe the bug

We are running a master<->master keydb setup, using keydb docker images from eqalpha/keydb:x86_64_v6.3.3

After some time without any problem, AOF file starts to grow on ONE instance.

Currently that's the instance where most clients connect, but we had the issue with other some time ago.

image

Here is info the the instance with growing AOF (rex):

127.0.0.1:13062> info 
# Server
redis_version:6.3.3
redis_git_sha1:86ea5c7e
redis_git_dirty:1
redis_build_id:942bbf4eda83edde
redis_mode:standalone
os:Linux 5.15.0-73-generic x86_64
arch_bits:64
multiplexing_api:epoll
atomicvar_api:atomic-builtin
gcc_version:7.5.0
process_id:1
process_supervised:no
run_id:5071892d308a75b7d310416a50939c495573149a
tcp_port:6379
server_time_usec:1686815422514961
uptime_in_seconds:253525
uptime_in_days:2
hz:10
configured_hz:10
lru_clock:9093822
executable:/data/keydb-server
config_file:

# Clients
connected_clients:41
cluster_connections:0
maxclients:10000
client_recent_max_input_buffer:48
client_recent_max_output_buffer:0
blocked_clients:0
tracking_clients:0
clients_in_timeout_table:0
current_client_thread:1
thread_0_clients:19
thread_1_clients:20
thread_2_clients:3
thread_3_clients:0

# Memory
used_memory:7533296
used_memory_human:7.18M
used_memory_rss:20541440
used_memory_rss_human:19.59M
used_memory_peak:20649880
used_memory_peak_human:19.69M
used_memory_peak_perc:36.48%
used_memory_overhead:5439192
used_memory_startup:3216024
used_memory_dataset:2094104
used_memory_dataset_perc:48.51%
allocator_allocated:8318968
allocator_active:10436608
allocator_resident:15495168
total_system_memory:67353542656
total_system_memory_human:62.73G
used_memory_lua:37888
used_memory_lua_human:37.00K
used_memory_scripts:0
used_memory_scripts_human:0B
number_of_cached_scripts:0
maxmemory:0
maxmemory_human:0B
maxmemory_policy:noeviction
allocator_frag_ratio:1.25
allocator_frag_bytes:2117640
allocator_rss_ratio:1.48
allocator_rss_bytes:5058560
rss_overhead_ratio:1.33
rss_overhead_bytes:5046272
mem_fragmentation_ratio:2.74
mem_fragmentation_bytes:13050456
mem_not_counted_for_evict:3066
mem_replication_backlog:1048576
mem_clients_slaves:20512
mem_clients_normal:799672
mem_aof_buffer:3072
mem_allocator:jemalloc-5.2.1
active_defrag_running:0
lazyfree_pending_objects:0
lazyfreed_objects:0
storage_provider:none

# Persistence
loading:0
current_cow_size:0
current_cow_size_age:0
current_fork_perc:0.00
current_save_keys_processed:0
current_save_keys_total:3160
rdb_changes_since_last_save:14470141
rdb_bgsave_in_progress:1
rdb_last_save_time:1686789002
rdb_last_bgsave_status:ok
rdb_last_bgsave_time_sec:0
rdb_current_bgsave_time_sec:26359
rdb_last_cow_size:1380352
aof_enabled:1
aof_rewrite_in_progress:0
aof_rewrite_scheduled:0
aof_last_rewrite_time_sec:1
aof_current_rewrite_time_sec:-1
aof_last_bgrewrite_status:ok
aof_last_write_status:ok
aof_last_cow_size:3182592
module_fork_in_progress:0
module_fork_last_cow_size:0
aof_current_size:690452188
aof_base_size:511532
aof_pending_rewrite:0
aof_buffer_length:0
aof_rewrite_buffer_length:0
aof_pending_bio_fsync:0
aof_delayed_fsync:0

# Stats
total_connections_received:1354187
total_commands_processed:577223792
instantaneous_ops_per_sec:2118
total_net_input_bytes:50324881019
total_net_output_bytes:24729808346
instantaneous_input_kbps:180.95
instantaneous_output_kbps:89.12
rejected_connections:0
sync_full:1
sync_partial_ok:0
sync_partial_err:1
expired_keys:28064642
expired_stale_perc:0.00
expired_time_cap_reached_count:0
expire_cycle_cpu_milliseconds:0
evicted_keys:0
keyspace_hits:101900287
keyspace_misses:12669
pubsub_channels:0
pubsub_patterns:0
latest_fork_usec:1581
total_forks:3827
migrate_cached_sockets:0
slave_expires_tracked_keys:0
active_defrag_hits:0
active_defrag_misses:0
active_defrag_key_hits:0
active_defrag_key_misses:0
tracking_total_keys:0
tracking_total_items:0
tracking_total_prefixes:0
unexpected_error_replies:60
total_error_replies:61
dump_payload_sanitizations:0
total_reads_processed:415062907
total_writes_processed:419483413
instantaneous_lock_contention:1
avg_lock_contention:0.250000
storage_provider_read_hits:0
storage_provider_read_misses:0

# Replication
role:active-replica
master_global_link_status:up
connected_masters:1
master_host:10.2.2.30
master_port:13062
master_link_status:up
master_last_io_seconds_ago:0
master_sync_in_progress:0
slave_read_repl_offset:39291105223
slave_repl_offset:39291105223
slave_priority:100
slave_read_only:0
replica_announced:1
connected_slaves:1
slave0:ip=172.17.0.1,port=6379,state=online,offset=23217116899,lag=0
master_failover_state:no-failover
master_replid:d5c363b90cc35be7739e4522d023993b0867b7e7
master_replid2:0000000000000000000000000000000000000000
master_repl_offset:23217153834
second_repl_offset:-1
repl_backlog_active:1
repl_backlog_size:1048576
repl_backlog_first_byte_offset:23216105259
repl_backlog_histlen:1048576

# CPU
used_cpu_sys:16727.780803
used_cpu_user:12209.666885
used_cpu_sys_children:17.220899
used_cpu_user_children:35.933273
server_threads:4
long_lock_waits:66
used_cpu_sys_main_thread:4372.172020
used_cpu_user_main_thread:2987.077438

# Modules

# Errorstats
errorstat_ERR:count=60
errorstat_LOADING:count=1

# Cluster
cluster_enabled:0

# Keyspace
db0:keys=4010,expires=2494,avg_ttl=14,cached_keys=4010

# KeyDB
mvcc_depth:0

Here is the info output for the instance without issue (rudi):

127.0.0.1:13062> info 
# Server
redis_version:6.3.3
redis_git_sha1:86ea5c7e
redis_git_dirty:1
redis_build_id:942bbf4eda83edde
redis_mode:standalone
os:Linux 5.15.0-73-generic x86_64
arch_bits:64
multiplexing_api:epoll
atomicvar_api:atomic-builtin
gcc_version:7.5.0
process_id:1
process_supervised:no
run_id:6aa098fbd1e98ed7a14d6ceb3e9f4e6099427cf5
tcp_port:6379
server_time_usec:1686815685014798
uptime_in_seconds:254474
uptime_in_days:2
hz:10
configured_hz:10
lru_clock:9094084
executable:/data/keydb-server
config_file:

# Clients
connected_clients:3
cluster_connections:0
maxclients:10000
client_recent_max_input_buffer:181
client_recent_max_output_buffer:0
blocked_clients:0
tracking_clients:0
clients_in_timeout_table:0
current_client_thread:0
thread_0_clients:4
thread_1_clients:0
thread_2_clients:0
thread_3_clients:0

# Memory
used_memory:5563704
used_memory_human:5.31M
used_memory_rss:19890176
used_memory_rss_human:18.97M
used_memory_peak:20035016
used_memory_peak_human:19.11M
used_memory_peak_perc:27.77%
used_memory_overhead:4644571
used_memory_startup:3216024
used_memory_dataset:919133
used_memory_dataset_perc:39.15%
allocator_allocated:6434128
allocator_active:7786496
allocator_resident:12460032
total_system_memory:67353542656
total_system_memory_human:62.73G
used_memory_lua:37888
used_memory_lua_human:37.00K
used_memory_scripts:0
used_memory_scripts_human:0B
number_of_cached_scripts:0
maxmemory:0
maxmemory_human:0B
maxmemory_policy:noeviction
allocator_frag_ratio:1.21
allocator_frag_bytes:1352368
allocator_rss_ratio:1.60
allocator_rss_bytes:4673536
rss_overhead_ratio:1.60
rss_overhead_bytes:7430144
mem_fragmentation_ratio:3.61
mem_fragmentation_bytes:14378072
mem_not_counted_for_evict:10234
mem_replication_backlog:1048576
mem_clients_slaves:20512
mem_clients_normal:61635
mem_aof_buffer:10240
mem_allocator:jemalloc-5.2.1
active_defrag_running:0
lazyfree_pending_objects:0
lazyfreed_objects:0
storage_provider:none

# Persistence
loading:0
current_cow_size:0
current_cow_size_age:0
current_fork_perc:0.00
current_save_keys_processed:0
current_save_keys_total:0
rdb_changes_since_last_save:18307
rdb_bgsave_in_progress:0
rdb_last_save_time:1686815659
rdb_last_bgsave_status:ok
rdb_last_bgsave_time_sec:0
rdb_current_bgsave_time_sec:-1
rdb_last_cow_size:1335296
aof_enabled:1
aof_rewrite_in_progress:0
aof_rewrite_scheduled:0
aof_last_rewrite_time_sec:1
aof_current_rewrite_time_sec:-1
aof_last_bgrewrite_status:ok
aof_last_write_status:ok
aof_last_cow_size:3375104
module_fork_in_progress:0
module_fork_last_cow_size:0
aof_current_size:53503249
aof_base_size:829446
aof_pending_rewrite:0
aof_buffer_length:8258
aof_rewrite_buffer_length:0
aof_pending_bio_fsync:1
aof_delayed_fsync:0

# Stats
total_connections_received:1043044
total_commands_processed:318688689
instantaneous_ops_per_sec:2209
total_net_input_bytes:23340769424
total_net_output_bytes:39376336108
instantaneous_input_kbps:159.02
instantaneous_output_kbps:267.64
rejected_connections:0
sync_full:2
sync_partial_ok:0
sync_partial_err:2
expired_keys:28165470
expired_stale_perc:0.00
expired_time_cap_reached_count:0
expire_cycle_cpu_milliseconds:0
evicted_keys:0
keyspace_hits:37180
keyspace_misses:8
pubsub_channels:0
pubsub_patterns:0
latest_fork_usec:1029
total_forks:4874
migrate_cached_sockets:0
slave_expires_tracked_keys:0
active_defrag_hits:0
active_defrag_misses:0
active_defrag_key_hits:0
active_defrag_key_misses:0
tracking_total_keys:0
tracking_total_items:0
tracking_total_prefixes:0
unexpected_error_replies:362559
total_error_replies:362561
dump_payload_sanitizations:0
total_reads_processed:157252115
total_writes_processed:156200680
instantaneous_lock_contention:1
avg_lock_contention:0.187500
storage_provider_read_hits:0
storage_provider_read_misses:0

# Replication
role:active-replica
master_global_link_status:up
connected_masters:1
master_host:10.2.2.60
master_port:13062
master_link_status:up
master_last_io_seconds_ago:0
master_sync_in_progress:0
slave_read_repl_offset:23243729863
slave_repl_offset:23243729863
slave_priority:100
slave_read_only:0
replica_announced:1
connected_slaves:1
slave0:ip=172.17.0.1,port=6379,state=online,offset=39335645161,lag=1
master_failover_state:no-failover
master_replid:95198220ff1d5218e0d3ab4827eb4cafdfaf5606
master_replid2:0000000000000000000000000000000000000000
master_repl_offset:39335906500
second_repl_offset:-1
repl_backlog_active:1
repl_backlog_size:1048576
repl_backlog_first_byte_offset:39334857925
repl_backlog_histlen:1048576

# CPU
used_cpu_sys:8319.863656
used_cpu_user:6645.456295
used_cpu_sys_children:25.666492
used_cpu_user_children:48.426543
server_threads:4
long_lock_waits:6
used_cpu_sys_main_thread:6656.645504
used_cpu_user_main_thread:5406.061082

# Modules

# Errorstats
errorstat_ERR:count=362560
errorstat_LOADING:count=1

# Cluster
cluster_enabled:0

# Keyspace
db0:keys=3719,expires=2712,avg_ttl=289,cached_keys=3719

# KeyDB
mvcc_depth:0

It should be noted that rudi instance is less active, but we had the problem with it too a week ago, we upgraded from 6.3.1 to 6.3.3 in the hope it will fix the issue when it happened.

On rex, we can see:

rdb_changes_since_last_save:14470141
rdb_bgsave_in_progress:1
rdb_last_save_time:1686789002
rdb_last_bgsave_status:ok
rdb_last_bgsave_time_sec:0
rdb_current_bgsave_time_sec:26359
rdb_last_cow_size:1380352
root@rex ~ # ps aux|grep keydb
systemd+ 2419957 11.4  0.0 334556 20208 ?        Ssl  Jun12 483:21 keydb-server *:6379
systemd+ 2948749  0.0  0.0 334556 10640 ?        S    00:31   0:00 keydb-server *:6379

Killing the stuck process:

kill 2419957

After few seconds, on rex:

127.0.0.1:13062> info persistence
# Persistence
loading:0
current_cow_size:0
current_cow_size_age:0
current_fork_perc:0.00
current_save_keys_processed:0
current_save_keys_total:0
rdb_changes_since_last_save:32396
rdb_bgsave_in_progress:0
rdb_last_save_time:1686816870
rdb_last_bgsave_status:ok
rdb_last_bgsave_time_sec:0
rdb_current_bgsave_time_sec:-1
rdb_last_cow_size:1728512
aof_enabled:1
aof_rewrite_in_progress:0
aof_rewrite_scheduled:0
aof_last_rewrite_time_sec:0
aof_current_rewrite_time_sec:-1
aof_last_bgrewrite_status:ok
aof_last_write_status:ok
aof_last_cow_size:29544448
module_fork_in_progress:0
module_fork_last_cow_size:0
aof_current_size:10898869
aof_base_size:8063487
aof_pending_rewrite:0
aof_buffer_length:0
aof_rewrite_buffer_length:0
aof_pending_bio_fsync:0
aof_delayed_fsync:0

To reproduce

Steps to reproduce the behavior and/or a minimal code sample.

Expected behavior

A description of what you expected to happen.

Additional information

Any additional information that is relevant to the problem.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions