Skip to content

Commit 7282c12

Browse files
committed
Merge branch 'smc-datapath-opts'
Dust Li says:

====================
net/smc: some datapath performance optimizations

This series tries to improve the performance of SMC in the datapath.

- patch #1 adds a sysctl interface to support tuning the behaviour of
  SMC in container environments.

- patch #2/#3 add autocorking support, which is very efficient for small
  messages without a trade-off in latency.

- patch #4 sends directly when TCP_NODELAY is set, without waking up the
  TX worker; this makes it consistent with clearing TCP_CORK.

- patch #5 corrects the setting of the RMB window update limit, so we
  don't send CDC messages to update the peer's RMB window too frequently
  in some cases.

- patch #6 implements something like NAPI in SMC, decreasing the number
  of hardirqs when busy.

- patch #7 moves the TX work done in the BH to the user context when
  sock_lock is held by the user.

With this patchset applied, we can get a good performance gain:
- The qperf tcp_bw test has shown a great improvement. Other benchmarks
  like 'netperf TCP_STREAM' or 'sockperf throughput' have similar results.
- In my testing environment, running qperf tcp_bw and tcp_lat, SMC
  behaves better than TCP for almost all message sizes.
Here are some test results with the following testing command:
client: smc_run taskset -c 1 qperf smc-server -oo msg_size:1:64K:*2 \
        -t 30 -vu tcp_{bw|lat}
server: smc_run taskset -c 1 qperf

==== Bandwidth ====
MsgSize   Origin SMC       TCP                       SMC with patches
1         0.578 MB/s       2.392 MB/s(313.57%)       2.561 MB/s(342.83%)
2         1.159 MB/s       4.780 MB/s(312.53%)       5.162 MB/s(345.46%)
4         2.283 MB/s       10.266 MB/s(349.77%)      10.122 MB/s(343.46%)
8         4.668 MB/s       19.040 MB/s(307.86%)      20.521 MB/s(339.59%)
16        9.147 MB/s       38.904 MB/s(325.31%)      40.823 MB/s(346.29%)
32        18.369 MB/s      79.587 MB/s(333.25%)      80.535 MB/s(338.42%)
64        36.562 MB/s      148.668 MB/s(306.61%)     158.170 MB/s(332.60%)
128       72.961 MB/s      274.913 MB/s(276.80%)     316.217 MB/s(333.41%)
256       144.705 MB/s     512.059 MB/s(253.86%)     626.019 MB/s(332.62%)
512       288.873 MB/s     884.977 MB/s(206.35%)     1221.596 MB/s(322.88%)
1024      574.180 MB/s     1337.736 MB/s(132.98%)    2203.156 MB/s(283.70%)
2048      1095.192 MB/s    1865.952 MB/s( 70.38%)    3036.448 MB/s(177.25%)
4096      2066.157 MB/s    2380.337 MB/s( 15.21%)    3834.271 MB/s( 85.58%)
8192      3717.198 MB/s    2733.073 MB/s(-26.47%)    4904.910 MB/s( 31.95%)
16384     4742.221 MB/s    2958.693 MB/s(-37.61%)    5220.272 MB/s( 10.08%)
32768     5349.550 MB/s    3061.285 MB/s(-42.77%)    5321.865 MB/s( -0.52%)
65536     5162.919 MB/s    3731.408 MB/s(-27.73%)    5245.021 MB/s(  1.59%)

==== Latency ====
MsgSize   Origin SMC       TCP                       SMC with patches
1         10.540 us        11.938 us( 13.26%)        10.356 us( -1.75%)
2         10.996 us        11.992 us(  9.06%)        10.073 us( -8.39%)
4         10.229 us        11.687 us( 14.25%)        9.996 us( -2.28%)
8         10.203 us        11.653 us( 14.21%)        10.063 us( -1.37%)
16        10.530 us        11.313 us(  7.44%)        10.013 us( -4.91%)
32        10.241 us        11.586 us( 13.13%)        10.081 us( -1.56%)
64        10.693 us        11.652 us(  8.97%)        9.986 us( -6.61%)
128       10.597 us        11.579 us(  9.27%)        10.262 us( -3.16%)
256       10.409 us        11.957 us( 14.87%)        10.148 us( -2.51%)
512       11.088 us        12.505 us( 12.78%)        10.206 us( -7.95%)
1024      11.240 us        12.255 us(  9.03%)        10.631 us( -5.42%)
2048      11.485 us        16.970 us( 47.76%)        10.981 us( -4.39%)
4096      12.077 us        13.948 us( 15.49%)        11.847 us( -1.90%)
8192      13.683 us        16.693 us( 22.00%)        13.336 us( -2.54%)
16384     16.470 us        23.615 us( 43.38%)        16.519 us(  0.30%)
32768     22.540 us        40.966 us( 81.75%)        22.452 us( -0.39%)
65536     34.192 us        73.003 us(113.51%)        33.916 us( -0.81%)

------------
Test environment notes:
1. Testing is run on 2 VMs within the same physical host.
2. The NIC is a ConnectX-4Lx, using SR-IOV, with 2 VFs passed through
   to the 2 VMs respectively.
3. To decrease jitter, each VM's vCPUs are bound to physical CPUs, and
   those physical CPUs are all isolated using the boot parameter
   `isolcpus=xxx`.
4. The number of queues is set to 1, and the interrupt from the queue is
   bound to CPU0 in the guest.
====================

Signed-off-by: David S. Miller <[email protected]>
2 parents 1e385c0 + 6b88af8 commit 7282c12

File tree

11 files changed

+317
-42
lines changed

11 files changed

+317
-42
lines changed
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
.. SPDX-License-Identifier: GPL-2.0
2+
3+
==========
4+
SMC Sysctl
5+
==========
6+
7+
/proc/sys/net/smc/* Variables
8+
==============================
9+
10+
autocorking_size - INTEGER
11+
Setting SMC auto corking size:
12+
SMC auto corking is like TCP auto corking from the application's
13+
point of view. When applications do consecutive small
14+
write()/sendmsg() system calls, we try to coalesce these small writes
15+
as much as possible, to lower the total number of CDC messages and RDMA Writes
16+
sent.
17+
autocorking_size limits the maximum corked bytes that can be sent to
18+
the underlying device in a single send. If set to 0, SMC auto corking
19+
is disabled.
20+
Applications can still use TCP_CORK for optimal behavior when they
21+
know how/when to uncork their sockets.
22+
23+
Default: 64K

include/net/netns/smc.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,9 @@ struct netns_smc {
1414
struct smc_stats_rsn *fback_rsn;
1515

1616
bool limit_smc_hs; /* constraint on handshake */
17+
#ifdef CONFIG_SYSCTL
18+
struct ctl_table_header *smc_hdr;
19+
#endif
20+
unsigned int sysctl_autocorking_size;
1721
};
1822
#endif

net/smc/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o
44
obj-$(CONFIG_SMC_DIAG) += smc_diag.o
55
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
66
smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
7-
smc-y += smc_tracepoint.o
7+
smc-y += smc_tracepoint.o smc_sysctl.o

net/smc/af_smc.c

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
#include "smc_close.h"
5252
#include "smc_stats.h"
5353
#include "smc_tracepoint.h"
54+
#include "smc_sysctl.h"
5455

5556
static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
5657
* creation on server
@@ -192,12 +193,27 @@ void smc_unhash_sk(struct sock *sk)
192193
}
193194
EXPORT_SYMBOL_GPL(smc_unhash_sk);
194195

196+
/* This will be called before user really release sock_lock. So do the
197+
* work which we didn't do because of user hold the sock_lock in the
198+
* BH context
199+
*/
200+
static void smc_release_cb(struct sock *sk)
201+
{
202+
struct smc_sock *smc = smc_sk(sk);
203+
204+
if (smc->conn.tx_in_release_sock) {
205+
smc_tx_pending(&smc->conn);
206+
smc->conn.tx_in_release_sock = false;
207+
}
208+
}
209+
195210
struct proto smc_proto = {
196211
.name = "SMC",
197212
.owner = THIS_MODULE,
198213
.keepalive = smc_set_keepalive,
199214
.hash = smc_hash_sk,
200215
.unhash = smc_unhash_sk,
216+
.release_cb = smc_release_cb,
201217
.obj_size = sizeof(struct smc_sock),
202218
.h.smc_hash = &smc_v4_hashinfo,
203219
.slab_flags = SLAB_TYPESAFE_BY_RCU,
@@ -210,6 +226,7 @@ struct proto smc_proto6 = {
210226
.keepalive = smc_set_keepalive,
211227
.hash = smc_hash_sk,
212228
.unhash = smc_unhash_sk,
229+
.release_cb = smc_release_cb,
213230
.obj_size = sizeof(struct smc_sock),
214231
.h.smc_hash = &smc_v6_hashinfo,
215232
.slab_flags = SLAB_TYPESAFE_BY_RCU,
@@ -2795,8 +2812,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
27952812
sk->sk_state != SMC_CLOSED) {
27962813
if (val) {
27972814
SMC_STAT_INC(smc, ndly_cnt);
2798-
mod_delayed_work(smc->conn.lgr->tx_wq,
2799-
&smc->conn.tx_work, 0);
2815+
smc_tx_pending(&smc->conn);
2816+
cancel_delayed_work(&smc->conn.tx_work);
28002817
}
28012818
}
28022819
break;
@@ -3273,9 +3290,17 @@ static int __init smc_init(void)
32733290
goto out_sock;
32743291
}
32753292

3293+
rc = smc_sysctl_init();
3294+
if (rc) {
3295+
pr_err("%s: sysctl_init fails with %d\n", __func__, rc);
3296+
goto out_ulp;
3297+
}
3298+
32763299
static_branch_enable(&tcp_have_smc);
32773300
return 0;
32783301

3302+
out_ulp:
3303+
tcp_unregister_ulp(&smc_ulp_ops);
32793304
out_sock:
32803305
sock_unregister(PF_SMC);
32813306
out_proto6:
@@ -3303,6 +3328,7 @@ static int __init smc_init(void)
33033328
static void __exit smc_exit(void)
33043329
{
33053330
static_branch_disable(&tcp_have_smc);
3331+
smc_sysctl_exit();
33063332
tcp_unregister_ulp(&smc_ulp_ops);
33073333
sock_unregister(PF_SMC);
33083334
smc_core_exit();

net/smc/smc.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM
3030
* devices
3131
*/
32+
#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */
3233

3334
extern struct proto smc_proto;
3435
extern struct proto smc_proto6;
@@ -192,6 +193,7 @@ struct smc_connection {
192193
* - dec on polled tx cqe
193194
*/
194195
wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
196+
atomic_t tx_pushing; /* nr_threads trying tx push */
195197
struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
196198
u32 tx_off; /* base offset in peer rmb */
197199

@@ -211,6 +213,10 @@ struct smc_connection {
211213
* data still pending
212214
*/
213215
char urg_rx_byte; /* urgent byte */
216+
bool tx_in_release_sock;
217+
/* flush pending tx data in
218+
* sock release_cb()
219+
*/
214220
atomic_t bytes_to_rcv; /* arrived data,
215221
* not yet received
216222
*/

net/smc/smc_cdc.c

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,19 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
4848
conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
4949
}
5050

51-
if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) &&
52-
unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
53-
wake_up(&conn->cdc_pend_tx_wq);
51+
if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) {
52+
/* If user owns the sock_lock, mark the connection need sending.
53+
* User context will later try to send when it release sock_lock
54+
* in smc_release_cb()
55+
*/
56+
if (sock_owned_by_user(&smc->sk))
57+
conn->tx_in_release_sock = true;
58+
else
59+
smc_tx_pending(conn);
60+
61+
if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
62+
wake_up(&conn->cdc_pend_tx_wq);
63+
}
5464
WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);
5565

5666
smc_tx_sndbuf_nonfull(smc);
@@ -350,8 +360,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
350360
/* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
351361
if ((diff_cons && smc_tx_prepared_sends(conn)) ||
352362
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
353-
conn->local_rx_ctrl.prod_flags.urg_data_pending)
354-
smc_tx_sndbuf_nonempty(conn);
363+
conn->local_rx_ctrl.prod_flags.urg_data_pending) {
364+
if (!sock_owned_by_user(&smc->sk))
365+
smc_tx_pending(conn);
366+
else
367+
conn->tx_in_release_sock = true;
368+
}
355369

356370
if (diff_cons && conn->urg_tx_pend &&
357371
atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {

net/smc/smc_core.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1988,7 +1988,7 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
19881988
*/
19891989
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
19901990
{
1991-
return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
1991+
return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
19921992
}
19931993

19941994
/* map an rmb buf to a link */

net/smc/smc_sysctl.c

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/*
3+
* Shared Memory Communications over RDMA (SMC-R) and RoCE
4+
*
5+
* smc_sysctl.c: sysctl interface to SMC subsystem.
6+
*
7+
* Copyright (c) 2022, Alibaba Inc.
8+
*
9+
* Author: Tony Lu <[email protected]>
10+
*
11+
*/
12+
13+
#include <linux/init.h>
14+
#include <linux/sysctl.h>
15+
#include <net/net_namespace.h>
16+
17+
#include "smc.h"
18+
#include "smc_sysctl.h"
19+
20+
static struct ctl_table smc_table[] = {
21+
{
22+
.procname = "autocorking_size",
23+
.data = &init_net.smc.sysctl_autocorking_size,
24+
.maxlen = sizeof(unsigned int),
25+
.mode = 0644,
26+
.proc_handler = proc_douintvec,
27+
},
28+
{ }
29+
};
30+
31+
static __net_init int smc_sysctl_init_net(struct net *net)
32+
{
33+
struct ctl_table *table;
34+
35+
table = smc_table;
36+
if (!net_eq(net, &init_net)) {
37+
int i;
38+
39+
table = kmemdup(table, sizeof(smc_table), GFP_KERNEL);
40+
if (!table)
41+
goto err_alloc;
42+
43+
for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++)
44+
table[i].data += (void *)net - (void *)&init_net;
45+
}
46+
47+
net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table);
48+
if (!net->smc.smc_hdr)
49+
goto err_reg;
50+
51+
net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
52+
53+
return 0;
54+
55+
err_reg:
56+
if (!net_eq(net, &init_net))
57+
kfree(table);
58+
err_alloc:
59+
return -ENOMEM;
60+
}
61+
62+
static __net_exit void smc_sysctl_exit_net(struct net *net)
63+
{
64+
unregister_net_sysctl_table(net->smc.smc_hdr);
65+
}
66+
67+
static struct pernet_operations smc_sysctl_ops __net_initdata = {
68+
.init = smc_sysctl_init_net,
69+
.exit = smc_sysctl_exit_net,
70+
};
71+
72+
/* Register the SMC sysctl per-netns operations.
 * Returns the result of register_pernet_subsys().
 */
int __init smc_sysctl_init(void)
{
	int rc;

	rc = register_pernet_subsys(&smc_sysctl_ops);
	return rc;
}
76+
77+
void smc_sysctl_exit(void)
78+
{
79+
unregister_pernet_subsys(&smc_sysctl_ops);
80+
}

net/smc/smc_sysctl.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
/*
3+
* Shared Memory Communications over RDMA (SMC-R) and RoCE
4+
*
5+
* smc_sysctl.h: sysctl interface to SMC subsystem.
6+
*
7+
* Copyright (c) 2022, Alibaba Inc.
8+
*
9+
* Author: Tony Lu <[email protected]>
10+
*
11+
*/
12+
13+
#ifndef _SMC_SYSCTL_H
14+
#define _SMC_SYSCTL_H
15+
16+
#ifdef CONFIG_SYSCTL
17+
18+
int smc_sysctl_init(void);
19+
void smc_sysctl_exit(void);
20+
21+
#else
22+
23+
/* CONFIG_SYSCTL=n stub: nothing to register, always succeeds.
 * Must be static inline: a plain definition in a header would be emitted
 * by every file that includes it (duplicate symbols, missing prototype).
 */
static inline int smc_sysctl_init(void)
{
	return 0;
}
27+
28+
/* CONFIG_SYSCTL=n stub: nothing to unregister. static inline so that the
 * header can be included from several files without duplicate symbols.
 */
static inline void smc_sysctl_exit(void) { }
29+
30+
#endif /* CONFIG_SYSCTL */
31+
32+
#endif /* _SMC_SYSCTL_H */

0 commit comments

Comments
 (0)