Skip to content

Commit a6b118f

Browse files
Florian Westphal authored and davem330 (David S. Miller) committed
mptcp: add receive buffer auto-tuning
When mptcp is used, userspace doesn't read from the tcp (subflow) socket but from the parent (mptcp) socket receive queue. skbs are moved from the subflow socket to the mptcp rx queue either from 'data_ready' callback (if mptcp socket can be locked), a work queue, or the socket receive function. This means tcp_rcv_space_adjust() is never called and thus no receive buffer size auto-tuning is done. An earlier (not merged) patch added tcp_rcv_space_adjust() calls to the function that moves skbs from subflow to mptcp socket. While this enabled autotuning, it also meant tuning was done even if userspace was reading the mptcp socket very slowly. This adds mptcp_rcv_space_adjust() and calls it after userspace has read data from the mptcp socket rx queue. It's very similar to tcp_rcv_space_adjust, with two differences: 1. The rtt estimate is the largest one observed on a subflow 2. The rcvbuf size and window clamp of all subflows is adjusted to the mptcp-level rcvbuf. Otherwise, we get spurious drops at tcp (subflow) socket level if the skbs are not moved to the mptcp socket fast enough. Before: time mptcp_connect.sh -t -f $((4*1024*1024)) -d 300 -l 0.01% -r 0 -e "" -m mmap [..] 
ns4 MPTCP -> ns3 (10.0.3.2:10108 ) MPTCP (duration 40823ms) [ OK ] ns4 MPTCP -> ns3 (10.0.3.2:10109 ) TCP (duration 23119ms) [ OK ] ns4 TCP -> ns3 (10.0.3.2:10110 ) MPTCP (duration 5421ms) [ OK ] ns4 MPTCP -> ns3 (dead:beef:3::2:10111) MPTCP (duration 41446ms) [ OK ] ns4 MPTCP -> ns3 (dead:beef:3::2:10112) TCP (duration 23427ms) [ OK ] ns4 TCP -> ns3 (dead:beef:3::2:10113) MPTCP (duration 5426ms) [ OK ] Time: 1396 seconds After: ns4 MPTCP -> ns3 (10.0.3.2:10108 ) MPTCP (duration 5417ms) [ OK ] ns4 MPTCP -> ns3 (10.0.3.2:10109 ) TCP (duration 5427ms) [ OK ] ns4 TCP -> ns3 (10.0.3.2:10110 ) MPTCP (duration 5422ms) [ OK ] ns4 MPTCP -> ns3 (dead:beef:3::2:10111) MPTCP (duration 5415ms) [ OK ] ns4 MPTCP -> ns3 (dead:beef:3::2:10112) TCP (duration 5422ms) [ OK ] ns4 TCP -> ns3 (dead:beef:3::2:10113) MPTCP (duration 5423ms) [ OK ] Time: 296 seconds Signed-off-by: Florian Westphal <[email protected]> Reviewed-by: Matthieu Baerts <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 767659f commit a6b118f

File tree

3 files changed

+127
-8
lines changed

3 files changed

+127
-8
lines changed

net/mptcp/protocol.c

+116-7
Original file line numberDiff line numberDiff line change
@@ -179,13 +179,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
179179
return false;
180180
}
181181

182-
if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
183-
int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf);
184-
185-
if (rcvbuf > sk->sk_rcvbuf)
186-
sk->sk_rcvbuf = rcvbuf;
187-
}
188-
189182
tp = tcp_sk(ssk);
190183
do {
191184
u32 map_remaining, offset;
@@ -916,6 +909,100 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
916909
return copied;
917910
}
918911

912+
/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
 *
 * Only difference: Use highest rtt estimate of the subflows in use.
 */
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	u32 time, advmss = 1;
	u64 rtt_us, mstamp;

	/* caller must hold the msk socket lock (this function reads and
	 * writes msk->rcvq_space without further locking)
	 */
	sock_owned_by_me(sk);

	if (copied <= 0)
		return;

	/* accumulate bytes handed to userspace in the current window */
	msk->rcvq_space.copied += copied;

	mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
	time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);

	/* too early: less than rtt/8 elapsed since the window started */
	rtt_us = msk->rcvq_space.rtt_us;
	if (rtt_us && time < (rtt_us >> 3))
		return;

	/* recompute the rtt estimate as the max over all subflows; advmss
	 * likewise tracks the largest subflow advertised mss
	 */
	rtt_us = 0;
	mptcp_for_each_subflow(msk, subflow) {
		const struct tcp_sock *tp;
		u64 sf_rtt_us;
		u32 sf_advmss;

		tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));

		/* READ_ONCE: subflow fields are updated outside the msk lock */
		sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
		sf_advmss = READ_ONCE(tp->advmss);

		rtt_us = max(sf_rtt_us, rtt_us);
		advmss = max(sf_advmss, advmss);
	}

	msk->rcvq_space.rtt_us = rtt_us;
	/* re-check the elapsed-time gate against the fresh estimate; a zero
	 * rtt means no subflow has a usable measurement yet
	 */
	if (time < (rtt_us >> 3) || rtt_us == 0)
		return;

	/* no growth in throughput vs. the previous window: just restart the
	 * measurement, keeping rcvq_space.space unchanged
	 */
	if (msk->rcvq_space.copied <= msk->rcvq_space.space)
		goto new_measure;

	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvmem, rcvbuf;
		u64 rcvwin, grow;

		/* window target: twice the bytes copied last rtt, plus
		 * slack for 16 segments (mirrors tcp_rcv_space_adjust)
		 */
		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;

		/* scale the window up proportionally to the measured growth */
		grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);

		do_div(grow, msk->rcvq_space.space);
		rcvwin += (grow << 1);

		/* per-segment truesize: smallest power-of-steps rcvmem that
		 * still yields at least advmss of usable window
		 */
		rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
		while (tcp_win_from_space(sk, rcvmem) < advmss)
			rcvmem += 128;

		do_div(rcvwin, advmss);
		/* clamp to the tcp_rmem[2] sysctl maximum */
		rcvbuf = min_t(u64, rcvwin * rcvmem,
			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);

		if (rcvbuf > sk->sk_rcvbuf) {
			u32 window_clamp;

			window_clamp = tcp_win_from_space(sk, rcvbuf);
			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

			/* Make subflows follow along.  If we do not do this, we
			 * get drops at subflow level if skbs can't be moved to
			 * the mptcp rx queue fast enough (announced rcv_win can
			 * exceed ssk->sk_rcvbuf).
			 */
			mptcp_for_each_subflow(msk, subflow) {
				struct sock *ssk;

				ssk = mptcp_subflow_tcp_sock(subflow);
				WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
				tcp_sk(ssk)->window_clamp = window_clamp;
			}
		}
	}

	/* current window becomes the baseline for the next measurement */
	msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
	msk->rcvq_space.copied = 0;
	msk->rcvq_space.time = mstamp;
}
1005+
9191006
static bool __mptcp_move_skbs(struct mptcp_sock *msk)
9201007
{
9211008
unsigned int moved = 0;
@@ -1028,6 +1115,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
10281115
set_bit(MPTCP_DATA_READY, &msk->flags);
10291116
}
10301117
out_err:
1118+
mptcp_rcv_space_adjust(msk, copied);
1119+
10311120
release_sock(sk);
10321121
return copied;
10331122
}
@@ -1241,6 +1330,7 @@ static int mptcp_init_sock(struct sock *sk)
12411330
return ret;
12421331

12431332
sk_sockets_allocated_inc(sk);
1333+
sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
12441334
sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
12451335

12461336
return 0;
@@ -1423,6 +1513,22 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
14231513
return nsk;
14241514
}
14251515

1516+
/* Initialize the msk-level receive autotuning state from the first
 * established subflow @ssk.  Called once the connection is set up
 * (accept, connect-finish, or fallback paths).
 */
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
{
	const struct tcp_sock *tp = tcp_sk(ssk);

	msk->rcvq_space.copied = 0;
	msk->rcvq_space.rtt_us = 0;

	/* start the first measurement window at the subflow's clock */
	msk->rcvq_space.time = tp->tcp_mstamp;

	/* initial rcv_space offering made to peer */
	msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
				      TCP_INIT_CWND * tp->advmss);
	/* fall back to a sane default if rcv_wnd/advmss are not yet known */
	if (msk->rcvq_space.space == 0)
		msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
}
1531+
14261532
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
14271533
bool kern)
14281534
{
@@ -1471,6 +1577,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
14711577
list_add(&subflow->node, &msk->conn_list);
14721578
inet_sk_state_store(newsk, TCP_ESTABLISHED);
14731579

1580+
mptcp_rcv_space_init(msk, ssk);
14741581
bh_unlock_sock(new_mptcp_sock);
14751582

14761583
__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
@@ -1631,6 +1738,8 @@ void mptcp_finish_connect(struct sock *ssk)
16311738
atomic64_set(&msk->snd_una, msk->write_seq);
16321739

16331740
mptcp_pm_new_connection(msk, 0);
1741+
1742+
mptcp_rcv_space_init(msk, ssk);
16341743
}
16351744

16361745
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)

net/mptcp/protocol.h

+7
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,12 @@ struct mptcp_sock {
209209
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
210210
struct sock *first;
211211
struct mptcp_pm_data pm;
212+
struct {
213+
u32 space; /* bytes copied in last measurement window */
214+
u32 copied; /* bytes copied in this measurement window */
215+
u64 time; /* start time of measurement window */
216+
u64 rtt_us; /* last maximum rtt of subflows */
217+
} rcvq_space;
212218
};
213219

214220
#define mptcp_for_each_subflow(__msk, __subflow) \
@@ -369,6 +375,7 @@ void mptcp_get_options(const struct sk_buff *skb,
369375
struct mptcp_options_received *mp_opt);
370376

371377
void mptcp_finish_connect(struct sock *sk);
378+
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
372379
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
373380
bool mptcp_finish_join(struct sock *sk);
374381
void mptcp_data_acked(struct sock *sk);

net/mptcp/subflow.c

+4-1
Original file line numberDiff line numberDiff line change
@@ -225,8 +225,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
225225
pr_fallback(mptcp_sk(subflow->conn));
226226
}
227227

228-
if (mptcp_check_fallback(sk))
228+
if (mptcp_check_fallback(sk)) {
229+
mptcp_rcv_space_init(mptcp_sk(parent), sk);
229230
return;
231+
}
230232

231233
if (subflow->mp_capable) {
232234
pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
@@ -1118,6 +1120,7 @@ static void subflow_state_change(struct sock *sk)
11181120

11191121
if (subflow_simultaneous_connect(sk)) {
11201122
mptcp_do_fallback(sk);
1123+
mptcp_rcv_space_init(mptcp_sk(parent), sk);
11211124
pr_fallback(mptcp_sk(parent));
11221125
subflow->conn_finished = 1;
11231126
if (inet_sk_state_load(parent) == TCP_SYN_SENT) {

0 commit comments

Comments
 (0)