Description
We set up a 3FS cluster on 3 servers with identical configuration. After starting the mgmtd_main process and running the list-nodes command, the output is sometimes normal and sometimes abnormal; according to the logs, the socket (QP) cannot be set to the RTR state. mgmtd_main is deployed on one of the servers, which has 2 RDMA NICs: a dual-port CX6 bonded as bond1 (101.1.1.165) and a domestic dual-port NIC bonded as bond0 (100.1.1.165). The 3FS version in use is the latest code, cloned with git on 2025-08-28.
Command executed:
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://101.1.1.165:9000"]' "list-nodes"
Abnormal output of the command:
Encounter error: 6004(MgmtdClient::RoutingInfoNotReady)
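For reference, the mapping between the RDMA (verbs) devices and the two bonds can be confirmed with the commands below. This is only a diagnostic sketch; the device names are not taken from this report and have to be checked on the host itself:
rdma link show      # each RDMA device/port, its state and the netdev (bond) it is attached to
ibdev2netdev        # Mellanox OFED helper: ibdev <-> netdev mapping for the CX6 card
ibv_devinfo -l      # plain list of the verbs devices visible to libibverbs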
Relevant configuration files:
root@165:# cat /opt/3fs/etc/mgmtd_main_app.toml
allow_empty_node_id = true
node_id = 1
root@165:# cat /opt/3fs/etc/mgmtd_main.toml
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/mgmtd_main.log'
max_file_size = '100MB'
max_files = 10
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/mgmtd_main-err.log'
max_file_size = '100MB'
max_files = 10
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/mgmtd_main-fatal.log'
max_file_size = '100MB'
max_files = 10
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[common.memory]
prof_active = false
prof_prefix = ''
[common.monitor]
collect_period = '1s'
num_collectors = 1
[[common.monitor.reporters]]
type = 'monitor_collector'
[common.monitor.reporters.monitor_collector]
remote_ip = "101.1.1.165:10000"
[server.base.independent_thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'RDMA'
services = [ 'Mgmtd' ]
use_independent_thread_pool = false
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 1
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = [ 'bond1' ]
listen_port = 8000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'TCP'
services = [ 'Core' ]
use_independent_thread_pool = true
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 1
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = [ ]
listen_port = 9000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.service]
allow_heartbeat_from_unregistered = true
authenticate = false
bootstrapping_length = '2min'
bump_routing_info_version_interval = '5s'
check_status_interval = '10s'
client_session_timeout = '20min'
enable_routinginfo_cache = true
extend_lease_check_release_version = true
extend_lease_interval = '10s'
heartbeat_fail_interval = '1min'
heartbeat_ignore_stale_targets = true
heartbeat_ignore_unknown_targets = false
heartbeat_timestamp_valid_window = '30s'
lease_length = '1min'
new_chain_bootstrap_interval = '2min'
only_accept_client_uuid = false
retry_times_on_txn_errors = -1
send_heartbeat = true
send_heartbeat_interval = '10s'
suspicious_lease_interval = '20s'
target_info_load_interval = '1s'
target_info_persist_batch = 1000
target_info_persist_interval = '1s'
try_adjust_target_order_as_preferred = false
update_chains_interval = '1s'
update_metrics_interval = '1s'
validate_lease_on_write = true
[server.service.retry_transaction]
max_backoff = '1s'
max_retry_count = 10
[server.service.user_cache]
buckets = 127
exist_ttl = '5min'
inexist_ttl = '10s'
root@165:# cat /opt/3fs/etc/mgmtd_main_launcher.toml
allow_dev_version = true
cluster_id = 'nebulamatrix'
use_memkv = false
[fdb]
casual_read_risky = false
clusterFile = '/opt/3fs/etc/fdb.cluster'
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'
[ib_devices]
allow_no_usable_devices = false
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_roce_pkey_index = 0
default_traffic_class = 0
device_filter = [ ]
fork_safe = true
prefer_ibdevice = true
skip_inactive_ports = true
skip_unusable_device = true
subnets = []
[kv_engine]
use_memkv = false
[kv_engine.fdb]
casual_read_risky = false
clusterFile = ''
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'
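As a rough sanity check against the listener sections above (port 8000 for the Mgmtd RDMA group with filter_list = [ 'bond1' ], port 9000 for the Core TCP group), the sockets actually opened by mgmtd_main can be inspected on the 165 node. Whether the RDMA group shows up as a TCP listener depends on rdma_listen_ethernet, so treat this only as a sketch:
ss -lntp | grep mgmtd_main         # TCP listening sockets owned by mgmtd_main
ss -lntp | grep -E ':(8000|9000)'  # ports taken from the listener config above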
NIC configuration:
4: ens5f0: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq master bond0 state UP group default qlen 1000
link/ether fa:8c:37:65:6b:8e brd ff:ff:ff:ff:ff:ff permaddr 68:be:49:28:20:0e
altname enp49s0f0
5: ens5f1: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq master bond0 state UP group default qlen 1000
link/ether fa:8c:37:65:6b:8e brd ff:ff:ff:ff:ff:ff permaddr 68:be:49:28:20:0f
altname enp49s0f1
6: ens9f0np0: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq master bond1 state UP group default qlen 1000
link/ether 16:70:52:0c:aa:64 brd ff:ff:ff:ff:ff:ff permaddr e8:eb:d3:3a:58:5c
altname enp75s0f0np0
7: ens9f1np1: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq master bond1 state UP group default qlen 1000
link/ether 16:70:52:0c:aa:64 brd ff:ff:ff:ff:ff:ff permaddr e8:eb:d3:3a:58:5d
altname enp75s0f1np1
8: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether fa:8c:37:65:6b:8e brd ff:ff:ff:ff:ff:ff
inet 100.1.1.165/24 brd 100.1.1.255 scope global bond0
valid_lft forever preferred_lft forever
inet6 fe80::f88c:37ff:fe65:6b8e/64 scope link
valid_lft forever preferred_lft forever
9: bond1: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 16:70:52:0c:aa:64 brd ff:ff:ff:ff:ff:ff
inet 101.1.1.165/24 brd 101.1.1.255 scope global bond1
valid_lft forever preferred_lft forever
inet6 fe80::1470:52ff:fe0c:aa64/64 scope link
valid_lft forever preferred_lft forever
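To isolate whether the failure to move the QP to RTR comes from 3FS or from the RDMA path itself, a raw RC transfer between two of the nodes over bond1 can be tried with perftest. The device name and GID index below are assumptions and should be taken from show_gids / ibv_devinfo on the actual hosts:
# on 101.1.1.165 (server side)
ib_write_bw -d mlx5_bond_0 -x 3 -p 18515
# on one of the other nodes (client side)
ib_write_bw -d mlx5_bond_0 -x 3 -p 18515 101.1.1.165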