diff options
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r-- | net/ipv4/tcp_output.c | 141 |
1 files changed, 90 insertions, 51 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 02caeb7bcf..95618d0e78 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -39,11 +39,13 @@ #include <net/tcp.h> #include <net/mptcp.h> +#include <net/proto_memory.h> #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/module.h> #include <linux/static_key.h> +#include <linux/skbuff_ref.h> #include <trace/events/tcp.h> @@ -203,16 +205,17 @@ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) * This MUST be enforced by all callers. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, - __u32 *rcv_wnd, __u32 *window_clamp, + __u32 *rcv_wnd, __u32 *__window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); + u32 window_clamp = READ_ONCE(*__window_clamp); /* If no clamp set the clamp to the max possible scaled window */ - if (*window_clamp == 0) - (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE); - space = min(*window_clamp, space); + if (window_clamp == 0) + window_clamp = (U16_MAX << TCP_MAX_WSCALE); + space = min(window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) @@ -229,7 +232,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else - (*rcv_wnd) = min_t(u32, space, U16_MAX); + (*rcv_wnd) = space; if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); @@ -239,12 +242,13 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, /* Set window scaling on max possible window */ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); - space = min_t(u32, space, *window_clamp); + space = min_t(u32, space, window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ - (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); + WRITE_ONCE(*__window_clamp, + min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp)); } EXPORT_SYMBOL(tcp_select_initial_window); @@ -1499,18 +1503,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) +static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { + int tso_segs; + if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ - tcp_skb_pcount_set(skb, 1); TCP_SKB_CB(skb)->tcp_gso_size = 0; - } else { - tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); - TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + tcp_skb_pcount_set(skb, 1); + return 1; } + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + tso_segs = DIV_ROUND_UP(skb->len, mss_now); + tcp_skb_pcount_set(skb, tso_segs); + return tso_segs; } /* Pcount in the middle of the write queue got changed, we need to do various @@ -2070,16 +2078,10 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ -static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, - const struct sk_buff *skb) +static u32 tcp_cwnd_test(const struct tcp_sock *tp) { u32 in_flight, cwnd, halfcwnd; - /* Don't be strict about the congestion window for the final FIN. */ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && - tcp_skb_pcount(skb) == 1) - return 1; - in_flight = tcp_packets_in_flight(tp); cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) @@ -2100,10 +2102,9 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); - if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { - tcp_set_skb_tso_segs(skb, mss_now); - tso_segs = tcp_skb_pcount(skb); - } + if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) + return tcp_set_skb_tso_segs(skb, mss_now); + return tso_segs; } @@ -2403,6 +2404,21 @@ commit: return 0; } +/* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if + * all its payload was moved to another one (dst). + * Make sure to transfer tcp_flags, eor, and tstamp. + */ +static void tcp_eat_one_skb(struct sock *sk, + struct sk_buff *dst, + struct sk_buff *src) +{ + TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags; + TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor; + tcp_skb_collapse_tstamp(dst, src); + tcp_unlink_write_queue(src, sk); + tcp_wmem_free_skb(sk, src); +} + /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing @@ -2508,16 +2524,7 @@ static int tcp_mtu_probe(struct sock *sk) copy = min_t(int, skb->len, probe_size - len); if (skb->len <= copy) { - /* We've eaten all the data from this skb. - * Throw it away. */ - TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; - /* If this is the last SKB we copy and eor is set - * we need to propagate it to the new skb. - */ - TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor; - tcp_skb_collapse_tstamp(nskb, skb); - tcp_unlink_write_queue(skb, sk); - tcp_wmem_free_skb(sk, skb); + tcp_eat_one_skb(sk, nskb, skb); } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); @@ -2683,6 +2690,35 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } +/* First skb in the write queue is smaller than ideal packet size. + * Check if we can move payload from the second skb in the queue. + */ +static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount) +{ + struct sk_buff *next_skb = skb->next; + unsigned int nlen; + + if (tcp_skb_is_last(sk, skb)) + return; + + if (!tcp_skb_can_collapse(skb, next_skb)) + return; + + nlen = min_t(u32, amount, next_skb->len); + if (!nlen || !skb_shift(skb, next_skb, nlen)) + return; + + TCP_SKB_CB(skb)->end_seq += nlen; + TCP_SKB_CB(next_skb)->seq += nlen; + + if (!next_skb->len) { + /* In case FIN is set, we need to update end_seq */ + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + + tcp_eat_one_skb(sk, skb, next_skb); + } +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -2703,10 +2739,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int tso_segs, sent_pkts; - int cwnd_quota; + u32 cwnd_quota, max_segs; int result; bool is_cwnd_limited = false, is_rwnd_limited = false; - u32 max_segs; sent_pkts = 0; @@ -2724,6 +2759,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; + int missing_bytes; if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ @@ -2737,10 +2773,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (tcp_pacing_check(sk)) break; - tso_segs = tcp_init_tso_segs(skb, mss_now); - BUG_ON(!tso_segs); - - cwnd_quota = tcp_cwnd_test(tp, skb); + cwnd_quota = tcp_cwnd_test(tp); if (!cwnd_quota) { if (push_one == 2) /* Force out a loss probe pkt. */ @@ -2748,6 +2781,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, else break; } + cwnd_quota = min(cwnd_quota, max_segs); + missing_bytes = cwnd_quota * mss_now - skb->len; + if (missing_bytes > 0) + tcp_grow_skb(sk, skb, missing_bytes); + + tso_segs = tcp_set_skb_tso_segs(skb, mss_now); if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { is_rwnd_limited = true; @@ -2769,9 +2808,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - min_t(unsigned int, - cwnd_quota, - max_segs), + cwnd_quota, nonagle); if (skb->len > limit && @@ -3387,11 +3424,6 @@ start: err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } - /* To avoid taking spuriously low RTT samples based on a timestamp - * for a transmit that never happened, always mark EVER_RETRANS - */ - TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; - if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); @@ -3401,6 +3433,12 @@ start: } else if (err != -EBUSY) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } + + /* To avoid taking spuriously low RTT samples based on a timestamp + * for a transmit that never happened, always mark EVER_RETRANS + */ + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + return err; } @@ -3585,7 +3623,8 @@ void tcp_send_fin(struct sock *sk) * was unread data in the receive queue. This behavior is recommended * by RFC 2525, section 2.17. -DaveM */ -void tcp_send_active_reset(struct sock *sk, gfp_t priority) +void tcp_send_active_reset(struct sock *sk, gfp_t priority, + enum sk_rst_reason reason) { struct sk_buff *skb; @@ -3610,7 +3649,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* skb of trace_tcp_send_reset() keeps the skb that caused RST, * skb here is different to the troublesome skb, so use NULL */ - trace_tcp_send_reset(sk, NULL); + trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED); } /* Send a crossed SYN-ACK during socket establishment. @@ -3857,7 +3896,7 @@ static void tcp_connect_init(struct sock *sk) tcp_ca_dst_init(sk, dst); if (!tp->window_clamp) - tp->window_clamp = dst_metric(dst, RTAX_WINDOW); + WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW)); tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); @@ -3865,7 +3904,7 @@ static void tcp_connect_init(struct sock *sk) /* limit the window selection if the user enforce a smaller rx buffer */ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) - tp->window_clamp = tcp_full_space(sk); + WRITE_ONCE(tp->window_clamp, tcp_full_space(sk)); rcv_wnd = tcp_rwnd_init_bpf(sk); if (rcv_wnd == 0) |