Skip to content

rpc 命令执行异常 #337

@lizengwu

Description

@lizengwu

基于 3 台同配置服务器搭建 3FS 集群。启动 mgmtd_main 进程后执行 list-nodes 指令,命令回显时而正常、时而异常;日志显示 socket 无法设置为 RTR 状态。mgmtd_main 部署在其中一台服务器上,这台服务器有 2 张 RDMA 网卡:一张是双口 CX6,做 bond1(101.1.1.165);另一张是国产双口网卡,做 bond0(100.1.1.165)。使用的 3FS 版本是 2025.08.28 通过 git clone 获取的最新版本。
执行命令:
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://101.1.1.165:9000"]' "list-nodes"
执行命令异常回显:
Encounter error: 6004(MgmtdClient::RoutingInfoNotReady)

相关配置文件如下:
root@165:# cat /opt/3fs/etc/mgmtd_main_app.toml
allow_empty_node_id = true
node_id = 1
root@165:# cat /opt/3fs/etc/mgmtd_main.toml
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'

[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/mgmtd_main.log'
max_file_size = '100MB'
max_files = 10
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'

[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/mgmtd_main-err.log'
max_file_size = '100MB'
max_files = 10
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'

[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/mgmtd_main-fatal.log'
max_file_size = '100MB'
max_files = 10
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'

[common.memory]
prof_active = false
prof_prefix = ''

[common.monitor]
collect_period = '1s'
num_collectors = 1

[[common.monitor.reporters]]
type = 'monitor_collector'

[common.monitor.reporters.monitor_collector]
remote_ip = "101.1.1.165:10000"

[server.base.independent_thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'

[server.base.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'

[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'RDMA'
services = [ 'Mgmtd' ]
use_independent_thread_pool = false

[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'

[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4

[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 1
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14

[server.base.groups.io_worker.transport_pool]
max_connections = 1

[server.base.groups.listener]
domain_socket_index = 1
filter_list = [ 'bond1' ]
listen_port = 8000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false

[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'

[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'TCP'
services = [ 'Core' ]
use_independent_thread_pool = true

[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'

[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4

[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 1
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14

[server.base.groups.io_worker.transport_pool]
max_connections = 1

[server.base.groups.listener]
domain_socket_index = 1
filter_list = [ ]
listen_port = 9000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false

[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'

[server.service]
allow_heartbeat_from_unregistered = true
authenticate = false
bootstrapping_length = '2min'
bump_routing_info_version_interval = '5s'
check_status_interval = '10s'
client_session_timeout = '20min'
enable_routinginfo_cache = true
extend_lease_check_release_version = true
extend_lease_interval = '10s'
heartbeat_fail_interval = '1min'
heartbeat_ignore_stale_targets = true
heartbeat_ignore_unknown_targets = false
heartbeat_timestamp_valid_window = '30s'
lease_length = '1min'
new_chain_bootstrap_interval = '2min'
only_accept_client_uuid = false
retry_times_on_txn_errors = -1
send_heartbeat = true
send_heartbeat_interval = '10s'
suspicious_lease_interval = '20s'
target_info_load_interval = '1s'
target_info_persist_batch = 1000
target_info_persist_interval = '1s'
try_adjust_target_order_as_preferred = false
update_chains_interval = '1s'
update_metrics_interval = '1s'
validate_lease_on_write = true

[server.service.retry_transaction]
max_backoff = '1s'
max_retry_count = 10

[server.service.user_cache]
buckets = 127
exist_ttl = '5min'
inexist_ttl = '10s'
root@165:# cat /opt/3fs/etc/mgmtd_main
mgmtd_main_app.toml mgmtd_main_launcher.toml mgmtd_main.toml
root@165:# cat /opt/3fs/etc/mgmtd_main_launcher.toml
allow_dev_version = true
cluster_id = 'nebulamatrix'
use_memkv = false

[fdb]
casual_read_risky = false
clusterFile = '/opt/3fs/etc/fdb.cluster'
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'

[ib_devices]
allow_no_usable_devices = false
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_roce_pkey_index = 0
default_traffic_class = 0
device_filter = [ ]
fork_safe = true
prefer_ibdevice = true
skip_inactive_ports = true
skip_unusable_device = true
subnets = []

[kv_engine]
use_memkv = false

[kv_engine.fdb]
casual_read_risky = false
clusterFile = ''
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'
网卡配置如下:
4: ens5f0: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq master bond0 state UP group default qlen 1000
link/ether fa:8c:37:65:6b:8e brd ff:ff:ff:ff:ff:ff permaddr 68:be:49:28:20:0e
altname enp49s0f0
5: ens5f1: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq master bond0 state UP group default qlen 1000
link/ether fa:8c:37:65:6b:8e brd ff:ff:ff:ff:ff:ff permaddr 68:be:49:28:20:0f
altname enp49s0f1
6: ens9f0np0: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq master bond1 state UP group default qlen 1000
link/ether 16:70:52:0c:aa:64 brd ff:ff:ff:ff:ff:ff permaddr e8:eb:d3:3a:58:5c
altname enp75s0f0np0
7: ens9f1np1: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq master bond1 state UP group default qlen 1000
link/ether 16:70:52:0c:aa:64 brd ff:ff:ff:ff:ff:ff permaddr e8:eb:d3:3a:58:5d
altname enp75s0f1np1
8: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether fa:8c:37:65:6b:8e brd ff:ff:ff:ff:ff:ff
inet 100.1.1.165/24 brd 100.1.1.255 scope global bond0
valid_lft forever preferred_lft forever
inet6 fe80::f88c:37ff:fe65:6b8e/64 scope link
valid_lft forever preferred_lft forever
9: bond1: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 16:70:52:0c:aa:64 brd ff:ff:ff:ff:ff:ff
inet 101.1.1.165/24 brd 101.1.1.255 scope global bond1
valid_lft forever preferred_lft forever
inet6 fe80::1470:52ff:fe0c:aa64/64 scope link
valid_lft forever preferred_lft forever

mgmtd_main.log
mgmtd_main-err.log

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions