summaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_output.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--net/ipv4/tcp_output.c141
1 files changed, 90 insertions, 51 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 02caeb7bcf..95618d0e78 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -39,11 +39,13 @@
#include <net/tcp.h>
#include <net/mptcp.h>
+#include <net/proto_memory.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/static_key.h>
+#include <linux/skbuff_ref.h>
#include <trace/events/tcp.h>
@@ -203,16 +205,17 @@ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
* This MUST be enforced by all callers.
*/
void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
- __u32 *rcv_wnd, __u32 *window_clamp,
+ __u32 *rcv_wnd, __u32 *__window_clamp,
int wscale_ok, __u8 *rcv_wscale,
__u32 init_rcv_wnd)
{
unsigned int space = (__space < 0 ? 0 : __space);
+ u32 window_clamp = READ_ONCE(*__window_clamp);
/* If no clamp set the clamp to the max possible scaled window */
- if (*window_clamp == 0)
- (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
- space = min(*window_clamp, space);
+ if (window_clamp == 0)
+ window_clamp = (U16_MAX << TCP_MAX_WSCALE);
+ space = min(window_clamp, space);
/* Quantize space offering to a multiple of mss if possible. */
if (space > mss)
@@ -229,7 +232,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
else
- (*rcv_wnd) = min_t(u32, space, U16_MAX);
+ (*rcv_wnd) = space;
if (init_rcv_wnd)
*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
@@ -239,12 +242,13 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
/* Set window scaling on max possible window */
space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
- space = min_t(u32, space, *window_clamp);
+ space = min_t(u32, space, window_clamp);
*rcv_wscale = clamp_t(int, ilog2(space) - 15,
0, TCP_MAX_WSCALE);
}
/* Set the clamp no higher than max representable value */
- (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
+ WRITE_ONCE(*__window_clamp,
+ min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp));
}
EXPORT_SYMBOL(tcp_select_initial_window);
@@ -1499,18 +1503,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
}
/* Initialize TSO segments for a packet. */
-static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
+static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
+ int tso_segs;
+
if (skb->len <= mss_now) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
- tcp_skb_pcount_set(skb, 1);
TCP_SKB_CB(skb)->tcp_gso_size = 0;
- } else {
- tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
- TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
+ tcp_skb_pcount_set(skb, 1);
+ return 1;
}
+ TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
+ tso_segs = DIV_ROUND_UP(skb->len, mss_now);
+ tcp_skb_pcount_set(skb, tso_segs);
+ return tso_segs;
}
/* Pcount in the middle of the write queue got changed, we need to do various
@@ -2070,16 +2078,10 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
/* Can at least one segment of SKB be sent right now, according to the
* congestion window rules? If so, return how many segments are allowed.
*/
-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
- const struct sk_buff *skb)
+static u32 tcp_cwnd_test(const struct tcp_sock *tp)
{
u32 in_flight, cwnd, halfcwnd;
- /* Don't be strict about the congestion window for the final FIN. */
- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
- tcp_skb_pcount(skb) == 1)
- return 1;
-
in_flight = tcp_packets_in_flight(tp);
cwnd = tcp_snd_cwnd(tp);
if (in_flight >= cwnd)
@@ -2100,10 +2102,9 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
- if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
- tcp_set_skb_tso_segs(skb, mss_now);
- tso_segs = tcp_skb_pcount(skb);
- }
+ if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now))
+ return tcp_set_skb_tso_segs(skb, mss_now);
+
return tso_segs;
}
@@ -2403,6 +2404,21 @@ commit:
return 0;
}
+/* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if
+ * all its payload was moved to another one (dst).
+ * Make sure to transfer tcp_flags, eor, and tstamp.
+ */
+static void tcp_eat_one_skb(struct sock *sk,
+ struct sk_buff *dst,
+ struct sk_buff *src)
+{
+ TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags;
+ TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor;
+ tcp_skb_collapse_tstamp(dst, src);
+ tcp_unlink_write_queue(src, sk);
+ tcp_wmem_free_skb(sk, src);
+}
+
/* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing
@@ -2508,16 +2524,7 @@ static int tcp_mtu_probe(struct sock *sk)
copy = min_t(int, skb->len, probe_size - len);
if (skb->len <= copy) {
- /* We've eaten all the data from this skb.
- * Throw it away. */
- TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
- /* If this is the last SKB we copy and eor is set
- * we need to propagate it to the new skb.
- */
- TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
- tcp_skb_collapse_tstamp(nskb, skb);
- tcp_unlink_write_queue(skb, sk);
- tcp_wmem_free_skb(sk, skb);
+ tcp_eat_one_skb(sk, nskb, skb);
} else {
TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
~(TCPHDR_FIN|TCPHDR_PSH);
@@ -2683,6 +2690,35 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
tcp_chrono_set(tp, TCP_CHRONO_BUSY);
}
+/* First skb in the write queue is smaller than ideal packet size.
+ * Check if we can move payload from the second skb in the queue.
+ */
+static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount)
+{
+ struct sk_buff *next_skb = skb->next;
+ unsigned int nlen;
+
+ if (tcp_skb_is_last(sk, skb))
+ return;
+
+ if (!tcp_skb_can_collapse(skb, next_skb))
+ return;
+
+ nlen = min_t(u32, amount, next_skb->len);
+ if (!nlen || !skb_shift(skb, next_skb, nlen))
+ return;
+
+ TCP_SKB_CB(skb)->end_seq += nlen;
+ TCP_SKB_CB(next_skb)->seq += nlen;
+
+ if (!next_skb->len) {
+ /* In case FIN is set, we need to update end_seq */
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+
+ tcp_eat_one_skb(sk, skb, next_skb);
+ }
+}
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -2703,10 +2739,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
- int cwnd_quota;
+ u32 cwnd_quota, max_segs;
int result;
bool is_cwnd_limited = false, is_rwnd_limited = false;
- u32 max_segs;
sent_pkts = 0;
@@ -2724,6 +2759,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
max_segs = tcp_tso_segs(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
+ int missing_bytes;
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
@@ -2737,10 +2773,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (tcp_pacing_check(sk))
break;
- tso_segs = tcp_init_tso_segs(skb, mss_now);
- BUG_ON(!tso_segs);
-
- cwnd_quota = tcp_cwnd_test(tp, skb);
+ cwnd_quota = tcp_cwnd_test(tp);
if (!cwnd_quota) {
if (push_one == 2)
/* Force out a loss probe pkt. */
@@ -2748,6 +2781,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
else
break;
}
+ cwnd_quota = min(cwnd_quota, max_segs);
+ missing_bytes = cwnd_quota * mss_now - skb->len;
+ if (missing_bytes > 0)
+ tcp_grow_skb(sk, skb, missing_bytes);
+
+ tso_segs = tcp_set_skb_tso_segs(skb, mss_now);
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
is_rwnd_limited = true;
@@ -2769,9 +2808,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
- min_t(unsigned int,
- cwnd_quota,
- max_segs),
+ cwnd_quota,
nonagle);
if (skb->len > limit &&
@@ -3387,11 +3424,6 @@ start:
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
- /* To avoid taking spuriously low RTT samples based on a timestamp
- * for a transmit that never happened, always mark EVER_RETRANS
- */
- TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
-
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
TCP_SKB_CB(skb)->seq, segs, err);
@@ -3401,6 +3433,12 @@ start:
} else if (err != -EBUSY) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
}
+
+ /* To avoid taking spuriously low RTT samples based on a timestamp
+ * for a transmit that never happened, always mark EVER_RETRANS
+ */
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+
return err;
}
@@ -3585,7 +3623,8 @@ void tcp_send_fin(struct sock *sk)
* was unread data in the receive queue. This behavior is recommended
* by RFC 2525, section 2.17. -DaveM
*/
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+ enum sk_rst_reason reason)
{
struct sk_buff *skb;
@@ -3610,7 +3649,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
* skb here is different to the troublesome skb, so use NULL
*/
- trace_tcp_send_reset(sk, NULL);
+ trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
}
/* Send a crossed SYN-ACK during socket establishment.
@@ -3857,7 +3896,7 @@ static void tcp_connect_init(struct sock *sk)
tcp_ca_dst_init(sk, dst);
if (!tp->window_clamp)
- tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
+ WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW));
tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
tcp_initialize_rcv_mss(sk);
@@ -3865,7 +3904,7 @@ static void tcp_connect_init(struct sock *sk)
/* limit the window selection if the user enforce a smaller rx buffer */
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
(tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
- tp->window_clamp = tcp_full_space(sk);
+ WRITE_ONCE(tp->window_clamp, tcp_full_space(sk));
rcv_wnd = tcp_rwnd_init_bpf(sk);
if (rcv_wnd == 0)