/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 *
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <net/tcp.h>

#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse __read_mostly = 1;

/* People can turn this on to work with those rare, broken TCPs that
 * interpret the window field as a signed quantity.
 */
int sysctl_tcp_workaround_signed_windows __read_mostly = 0;

/* Default TSQ limit of four TSO segments */
int sysctl_tcp_limit_output_bytes __read_mostly = 262144;
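
/* Illustrative note (not from the original source): 262144 = 4 * 65536,
 * i.e. four maximally sized (64 KB) TSO frames.
 */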

/* This limits the percentage of the congestion window which we
 * will allow a single TSO frame to consume.  Building TSO frames
 * which are too large can cause TCP streams to be bursty.
 */
int sysctl_tcp_tso_win_divisor __read_mostly = 3;

/* By default, RFC2861 behavior.  */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;

static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp);

/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int prior_packets = tp->packets_out;

	tcp_advance_send_head(sk, skb);
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;

	tp->packets_out += tcp_skb_pcount(skb);
	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
		tcp_rearm_rto(sk);

	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
		      tcp_skb_pcount(skb));
}

/* SND.NXT, if window was not shrunk or the amount shrunk was less than one
 * window scaling factor due to loss of precision.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static inline __u32 tcp_acceptable_seq(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
	    (tp->rx_opt.wscale_ok &&
	     ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
		return tp->snd_nxt;
	else
		return tcp_wnd_end(tp);
}

/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not make 3, we advertise MSS, calculated from first
 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;

	if (dst) {
		unsigned int metric = dst_metric_advmss(dst);

		if (metric < mss) {
			mss = metric;
			tp->advmss = mss;
		}
	}

	return (__u16)mss;
}

/* RFC2861. Reset CWND after an idle period longer than RTO to "restart window".
 * This is the first part of cwnd validation mechanism.
 */
void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
	u32 cwnd = tp->snd_cwnd;

	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);

	tp->snd_ssthresh = tcp_current_ssthresh(sk);
	restart_cwnd = min(restart_cwnd, cwnd);

	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_jiffies32;
	tp->snd_cwnd_used = 0;
}
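
/* Worked example (illustrative, not part of the original source):
 * after an idle period a bit longer than 2 * RTO with snd_cwnd = 40 and
 * restart_cwnd = 10, the loop above halves cwnd twice (40 -> 20 -> 10);
 * the loop condition and the final max() guarantee the restart window
 * never drops below restart_cwnd.
 */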

/* Congestion state accounting after a packet has been sent. */
static void tcp_event_data_sent(struct tcp_sock *tp,
				struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const u32 now = tcp_jiffies32;

	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

	tp->lsndtime = now;

	/* If it is a reply for ato after last received
	 * packet, enter pingpong mode.
	 */
	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
		icsk->icsk_ack.pingpong = 1;
}

/* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
				      u32 rcv_nxt)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (unlikely(rcv_nxt != tp->rcv_nxt))
		return;  /* Special ACK sent by DCTCP to reflect ECN */
	tcp_dec_quickack_mode(sk, pkts);
	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}

u32 tcp_default_init_rwnd(u32 mss)
{
	/* Initial receive window should be twice of TCP_INIT_CWND to
	 * enable proper sending of new unsent data during fast recovery
	 * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
	 * limit when mss is larger than 1460.
	 */
	u32 init_rwnd = TCP_INIT_CWND * 2;

	if (mss > 1460)
		init_rwnd = max((1460 * init_rwnd) / mss, 2U);
	return init_rwnd;
}
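
/* Worked example (illustrative, not part of the original source):
 * for mss = 1460 the initial receive window stays at
 * TCP_INIT_CWND * 2 = 20 segments; for a 9000-byte jumbo MSS the
 * scaling above gives max((1460 * 20) / 9000, 2U) = 3 segments, so the
 * advertised byte count does not balloon with the larger MSS.
 */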

/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 */
void tcp_select_initial_window(int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale,
			       __u32 init_rcv_wnd)
{
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp set the clamp to the max possible scaled window */
	if (*window_clamp == 0)
		(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = rounddown(space, mss);

	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks. If the admin tells us
	 * it is likely we could be speaking with such a buggy stack
	 * we will truncate our initial window offering to 32K-1
	 * unless the remote has sent us a window scaling option,
	 * which we interpret as a sign the remote TCP is not
	 * misinterpreting the window field as a signed quantity.
	 */
	if (sysctl_tcp_workaround_signed_windows)
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = space;

	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window */
		space = max_t(u32, space, sysctl_tcp_rmem[2]);
		space = max_t(u32, space, sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
			space >>= 1;
			(*rcv_wscale)++;
		}
	}

	if (mss > (1 << *rcv_wscale)) {
		if (!init_rcv_wnd) /* Use default unless specified otherwise */
			init_rcv_wnd = tcp_default_init_rwnd(mss);
		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
	}

	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
}
EXPORT_SYMBOL(tcp_select_initial_window);
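
/* Worked example (illustrative, not part of the original source):
 * if the clamped space ends up at 4 MB, the shift loop above picks
 * rcv_wscale = 7: 4194304 >> 6 = 65536 still exceeds U16_MAX, while
 * 4194304 >> 7 = 32768 fits, so advertised windows are then expressed
 * in 128-byte units.
 */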

/* Choose a new window to advertise, update state in tcp_sock for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 old_win = tp->rcv_wnd;
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		if (new_win == 0)
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPWANTZEROWINDOWADV);
		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible
	 * scaled window.
	 */
	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

	/* RFC1323 scaling applied */
	new_win >>= tp->rx_opt.rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0) {
		tp->pred_flags = 0;
		if (old_win)
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPTOZEROWINDOWADV);
	} else if (old_win == 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
	}

	return new_win;
}
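
/* Worked example (illustrative, not part of the original source):
 * with rcv_wscale = 7 a window of 100000 bytes goes on the wire as
 * 100000 >> 7 = 781, which the peer reads back as 781 << 7 = 99968
 * bytes; when the freshly computed window would shrink below cur_win,
 * the ALIGN() above rounds cur_win up to that 128-byte granularity so
 * the advertised right edge never moves backwards.
 */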

/* Packet ECN state for a SYN-ACK */
static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
	if (!(tp->ecn_flags & TCP_ECN_OK))
		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
	else if (tcp_ca_needs_ecn(sk) ||
		 tcp_bpf_ca_needs_ecn(sk))
		INET_ECN_xmit(sk);
}

/* Packet ECN state for a SYN.  */
static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;

	if (!use_ecn) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
			use_ecn = true;
	}

	tp->ecn_flags = 0;

	if (use_ecn) {
		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
		tp->ecn_flags = TCP_ECN_OK;
		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
			INET_ECN_xmit(sk);
	}
}

static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
{
	if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
		/* tp->ecn_flags are cleared at a later point in time when
		 * SYN ACK is ultimately being received.
		 */
		TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
}

static void
tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
{
	if (inet_rsk(req)->ecn_ok)
		th->ece = 1;
}
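
/* For reference (illustrative note, not part of the original source):
 * this matches RFC 3168, Section 6.1.1: an ECN-setup SYN carries both
 * ECE and CWR (tcp_ecn_send_syn() above), while the ECN-setup SYN-ACK
 * answers with ECE alone, as tcp_ecn_make_synack() writes it.
 */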

/* Set up ECN state for a packet on an ESTABLISHED socket that is about to
 * be sent.
 */
static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
			 struct tcphdr *th, int tcp_header_len)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->ecn_flags & TCP_ECN_OK) {
		/* Not-retransmitted data segment: set ECT and inject CWR. */
		if (skb->len != tcp_header_len &&
		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
			INET_ECN_xmit(sk);
			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
				th->cwr = 1;
				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
			}
		} else if (!tcp_ca_needs_ecn(sk)) {
			/* ACK or retransmitted segment: clear ECT|CE */
			INET_ECN_dontxmit(sk);
		}
		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
			th->ece = 1;
	}
}

/* Constructs common control bits of non-data skb. If SYN/FIN is present,
 * auto increment end seqno.
 */
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
	skb->ip_summed = CHECKSUM_PARTIAL;
	skb->csum = 0;

	TCP_SKB_CB(skb)->tcp_flags = flags;
	TCP_SKB_CB(skb)->sacked = 0;

	tcp_skb_pcount_set(skb, 1);

	TCP_SKB_CB(skb)->seq = seq;
	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
		seq++;
	TCP_SKB_CB(skb)->end_seq = seq;
}

static inline bool tcp_urg_mode(const struct tcp_sock *tp)
{
	return tp->snd_una != tp->snd_up;
}

#define OPTION_SACK_ADVERTISE	(1 << 0)
#define OPTION_TS		(1 << 1)
#define OPTION_MD5		(1 << 2)
#define OPTION_WSCALE		(1 << 3)
#define OPTION_FAST_OPEN_COOKIE	(1 << 8)

struct tcp_out_options {
	u16 options;		/* bit field of OPTION_* */
	u16 mss;		/* 0 to disable */
	u8 ws;			/* window scale, 0 to disable */
	u8 num_sack_blocks;	/* number of SACK blocks to include */
	u8 hash_size;		/* bytes in hash_location */
	__u8 *hash_location;	/* temporary pointer, overloaded */
	__u32 tsval, tsecr;	/* need to include OPTION_TS */
	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
};

/* Write previously computed TCP options to the packet.
 *
 * Beware: Something in the Internet is very sensitive to the ordering of
 * TCP options, we learned this the hard way, so be careful here.
 * Luckily we can at least blame others for their non-compliance but from
 * inter-operability perspective it seems that we're somewhat stuck with
 * the ordering which we have been using if we want to keep working with
 * those broken things (not that it currently hurts anybody as there isn't
 * particular reason why the ordering would need to be changed).
 *
 * At least SACK_PERM as the first option is known to lead to a disaster
 * (but it may well be that other scenarios fail similarly).
 */
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
			      struct tcp_out_options *opts)
{
	u16 options = opts->options;	/* mungable copy */

	if (unlikely(OPTION_MD5 & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
		/* overload cookie hash location */
		opts->hash_location = (__u8 *)ptr;
		ptr += 4;
	}

	if (unlikely(opts->mss)) {
		*ptr++ = htonl((TCPOPT_MSS << 24) |
			       (TCPOLEN_MSS << 16) |
			       opts->mss);
	}

	if (likely(OPTION_TS & options)) {
		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
				       (TCPOLEN_SACK_PERM << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
			options &= ~OPTION_SACK_ADVERTISE;
		} else {
			*ptr++ = htonl((TCPOPT_NOP << 24) |
				       (TCPOPT_NOP << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
		}
		*ptr++ = htonl(opts->tsval);
		*ptr++ = htonl(opts->tsecr);
	}

	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK_PERM << 8) |
			       TCPOLEN_SACK_PERM);
	}

	if (unlikely(OPTION_WSCALE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_WINDOW << 16) |
			       (TCPOLEN_WINDOW << 8) |
			       opts->ws);
	}

	if (unlikely(opts->num_sack_blocks)) {
		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
			tp->duplicate_sack : tp->selective_acks;
		int this_sack;

		*ptr++ = htonl((TCPOPT_NOP  << 24) |
			       (TCPOPT_NOP  << 16) |
			       (TCPOPT_SACK <<  8) |
			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
						     TCPOLEN_SACK_PERBLOCK)));

		for (this_sack = 0; this_sack < opts->num_sack_blocks;
		     ++this_sack) {
			*ptr++ = htonl(sp[this_sack].start_seq);
			*ptr++ = htonl(sp[this_sack].end_seq);
		}

		tp->rx_opt.dsack = 0;
	}

	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
		u8 *p = (u8 *)ptr;
		u32 len; /* Fast Open option length */

		if (foc->exp) {
			len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
			*ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
				     TCPOPT_FASTOPEN_MAGIC);
			p += TCPOLEN_EXP_FASTOPEN_BASE;
		} else {
			len = TCPOLEN_FASTOPEN_BASE + foc->len;
			*p++ = TCPOPT_FASTOPEN;
			*p++ = len;
		}

		memcpy(p, foc->val, foc->len);
		if ((len & 3) == 2) {
			p[foc->len] = TCPOPT_NOP;
			p[foc->len + 1] = TCPOPT_NOP;
		}
		ptr += (len + 3) >> 2;
	}
}
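
/* Worked example (illustrative, not part of the original source):
 * for a typical SYN the writer above emits, in order, kind 2 (MSS,
 * 4 bytes), kind 4 folded with kind 8 in one word (SACK_PERM +
 * timestamps, 12 bytes) and NOP + kind 3 (window scale, 4 bytes),
 * i.e. the 20 bytes accounted for in tcp_syn_options() below.
 */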

/* Compute TCP options for SYN packets. This is not the final
 * network wire format yet.
 */
static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
				struct tcp_out_options *opts,
				struct tcp_md5sig_key **md5)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int remaining = MAX_TCP_OPTION_SPACE;
	struct tcp_fastopen_request *fastopen = tp->fastopen_req;

#ifdef CONFIG_TCP_MD5SIG
	*md5 = tp->af_specific->md5_lookup(sk, sk);
	if (*md5) {
		opts->options |= OPTION_MD5;
		remaining -= TCPOLEN_MD5SIG_ALIGNED;
	}
#else
	*md5 = NULL;
#endif

	/* We always get an MSS option.  The option bytes which will be seen in
	 * normal data packets should timestamps be used, must be in the MSS
	 * advertised.  But we subtract them from tp->mss_cache so that
	 * calculations in tcp_sendmsg are simpler etc.  So account for this
	 * fact here if necessary.  If we don't do this correctly, as a
	 * receiver we won't recognize data packets as being full sized when we
	 * should, and thus we won't abide by the delayed ACK rules correctly.
	 * SACKs don't matter, we never delay an ACK when we have any of those
	 * going out.  */
	opts->mss = tcp_advertise_mss(sk);
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
		opts->options |= OPTION_TS;
		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
		opts->tsecr = tp->rx_opt.ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}
	if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
		opts->ws = tp->rx_opt.rcv_wscale;
		opts->options |= OPTION_WSCALE;
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	}
	if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
		opts->options |= OPTION_SACK_ADVERTISE;
		if (unlikely(!(OPTION_TS & opts->options)))
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
	}

	if (fastopen && fastopen->cookie.len >= 0) {
		u32 need = fastopen->cookie.len;

		need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
					       TCPOLEN_FASTOPEN_BASE;
		need = (need + 3) & ~3U;  /* Align to 32 bits */
		if (remaining >= need) {
			opts->options |= OPTION_FAST_OPEN_COOKIE;
			opts->fastopen_cookie = &fastopen->cookie;
			remaining -= need;
			tp->syn_fastopen = 1;
			tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
		}
	}

	return MAX_TCP_OPTION_SPACE - remaining;
}
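
/* Worked example (illustrative, not part of the original source):
 * MAX_TCP_OPTION_SPACE is 40 bytes.  A typical SYN spends 4 (MSS) +
 * 12 (aligned timestamps) + 4 (aligned window scale) = 20 bytes, with
 * SACK_PERM riding inside the timestamp block, so 20 bytes remain;
 * an 8-byte Fast Open cookie needs 2 + 8 = 10 bytes, rounded up to 12,
 * which still fits.
 */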

/* Set up TCP options for SYN-ACKs. */
static unsigned int tcp_synack_options(struct request_sock *req,
				       unsigned int mss, struct sk_buff *skb,
				       struct tcp_out_options *opts,
				       const struct tcp_md5sig_key *md5,
				       struct tcp_fastopen_cookie *foc)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	unsigned int remaining = MAX_TCP_OPTION_SPACE;

#ifdef CONFIG_TCP_MD5SIG
	if (md5) {
		opts->options |= OPTION_MD5;
		remaining -= TCPOLEN_MD5SIG_ALIGNED;

		/* We can't fit any SACK blocks in a packet with MD5 + TS
		 * options. There was discussion about disabling SACK
		 * rather than TS in order to fit in better with old,
		 * buggy kernels, but that was deemed to be unnecessary.
		 */
		ireq->tstamp_ok &= !ireq->sack_ok;
	}
#endif

	/* We always send an MSS option. */
	opts->mss = mss;
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(ireq->wscale_ok)) {
		opts->ws = ireq->rcv_wscale;
		opts->options |= OPTION_WSCALE;
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	}
	if (likely(ireq->tstamp_ok)) {
		opts->options |= OPTION_TS;
		opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
		opts->tsecr = req->ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}
	if (likely(ireq->sack_ok)) {
		opts->options |= OPTION_SACK_ADVERTISE;
		if (unlikely(!ireq->tstamp_ok))
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
	}
	if (foc != NULL && foc->len >= 0) {
		u32 need = foc->len;

		need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
				   TCPOLEN_FASTOPEN_BASE;
		need = (need + 3) & ~3U;  /* Align to 32 bits */
		if (remaining >= need) {
			opts->options |= OPTION_FAST_OPEN_COOKIE;
			opts->fastopen_cookie = foc;
			remaining -= need;
		}
	}

	return MAX_TCP_OPTION_SPACE - remaining;
}

/* Compute TCP options for ESTABLISHED sockets. This is not the
 * final wire format yet.
 */
static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
					struct tcp_out_options *opts,
					struct tcp_md5sig_key **md5)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int size = 0;
	unsigned int eff_sacks;

	opts->options = 0;

#ifdef CONFIG_TCP_MD5SIG
	*md5 = tp->af_specific->md5_lookup(sk, sk);
	if (unlikely(*md5)) {
		opts->options |= OPTION_MD5;
		size += TCPOLEN_MD5SIG_ALIGNED;
	}
#else
	*md5 = NULL;
#endif

	if (likely(tp->rx_opt.tstamp_ok)) {
		opts->options |= OPTION_TS;
		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
		opts->tsecr = tp->rx_opt.ts_recent;
		size += TCPOLEN_TSTAMP_ALIGNED;
	}

	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
	if (unlikely(eff_sacks)) {
		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
		opts->num_sack_blocks =
			min_t(unsigned int, eff_sacks,
			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
			      TCPOLEN_SACK_PERBLOCK);
		size += TCPOLEN_SACK_BASE_ALIGNED +
			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
	}

	return size;
}
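
/* Worked example (illustrative, not part of the original source):
 * with timestamps in use, remaining = 40 - 12 = 28 bytes, so at most
 * (28 - 4) / 8 = 3 SACK blocks fit in a single ACK; any further
 * pending blocks have to wait for a later segment.
 */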

/* TCP SMALL QUEUES (TSQ)
 *
 * TSQ goal is to keep a small amount of skbs per tcp flow in tx queues (qdisc+dev)
 * to reduce RTT and bufferbloat.
 * We do this using a special skb destructor (tcp_wfree).
 *
 * It's important tcp_wfree() can be replaced by sock_wfree() in the event skb
 * needs to be reallocated in a driver.
 * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
 *
 * Since transmit from skb destructor is forbidden, we use a tasklet
 * to process all sockets that eventually need to send more skbs.
 * We use one tasklet per cpu, with its own queue of sockets.
 */
struct tsq_tasklet {
	struct tasklet_struct	tasklet;
	struct list_head	head; /* queue of tcp sockets */
};
static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
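
/* Flow sketch (illustrative, not part of the original source):
 * tcp_wfree() below runs when a queued skb is freed by the driver; if
 * the flow was throttled it sets TSQF_QUEUED, links tcp_sock::tsq_node
 * into this cpu's tsq_tasklet list and schedules the tasklet, whose
 * handler tcp_tasklet_func() then calls tcp_tsq_handler() to push more
 * segments via tcp_write_xmit().
 */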

static void tcp_tsq_handler(struct sock *sk)
{
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)) {
		struct tcp_sock *tp = tcp_sk(sk);

		if (tp->lost_out > tp->retrans_out &&
		    tp->snd_cwnd > tcp_packets_in_flight(tp)) {
			tcp_mstamp_refresh(tp);
			tcp_xmit_retransmit_queue(sk);
		}

		tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
			       0, GFP_ATOMIC);
	}
}
/*
 * One tasklet per cpu tries to send more skbs.
 * We run in tasklet context but need to disable irqs when
 * transferring tsq->head because tcp_wfree() might
 * interrupt us (non NAPI drivers)
 */
static void tcp_tasklet_func(unsigned long data)
{
	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
	LIST_HEAD(list);
	unsigned long flags;
	struct list_head *q, *n;
	struct tcp_sock *tp;
	struct sock *sk;

	local_irq_save(flags);
	list_splice_init(&tsq->head, &list);
	local_irq_restore(flags);

	list_for_each_safe(q, n, &list) {
		tp = list_entry(q, struct tcp_sock, tsq_node);
		list_del(&tp->tsq_node);

		sk = (struct sock *)tp;
		smp_mb__before_atomic();
		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);

		if (!sk->sk_lock.owned &&
		    test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
			bh_lock_sock(sk);
			if (!sock_owned_by_user(sk)) {
				clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
				tcp_tsq_handler(sk);
			}
			bh_unlock_sock(sk);
		}

		sk_free(sk);
	}
}

#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
			  TCPF_WRITE_TIMER_DEFERRED |	\
			  TCPF_DELACK_TIMER_DEFERRED |	\
			  TCPF_MTU_REDUCED_DEFERRED)
/**
 * tcp_release_cb - tcp release_sock() callback
 * @sk: socket
 *
 * called from release_sock() to perform protocol dependent
 * actions before socket release.
 */
void tcp_release_cb(struct sock *sk)
{
	unsigned long flags, nflags;

	/* perform an atomic operation only if at least one flag is set */
	do {
		flags = sk->sk_tsq_flags;
		if (!(flags & TCP_DEFERRED_ALL))
			return;
		nflags = flags & ~TCP_DEFERRED_ALL;
	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

	if (flags & TCPF_TSQ_DEFERRED)
		tcp_tsq_handler(sk);

	/* Here begins the tricky part :
	 * We are called from release_sock() with :
	 * 1) BH disabled
	 * 2) sk_lock.slock spinlock held
	 * 3) socket owned by us (sk->sk_lock.owned == 1)
	 *
	 * But following code is meant to be called from BH handlers,
	 * so we should keep BH disabled, but early release socket ownership
	 */
	sock_release_ownership(sk);

	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
		tcp_write_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
		tcp_delack_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
		__sock_put(sk);
	}
}
EXPORT_SYMBOL(tcp_release_cb);

void __init tcp_tasklet_init(void)
{
	int i;

	for_each_possible_cpu(i) {
		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);

		INIT_LIST_HEAD(&tsq->head);
		tasklet_init(&tsq->tasklet,
			     tcp_tasklet_func,
			     (unsigned long)tsq);
	}
}

/*
 * Write buffer destructor automatically called from kfree_skb.
 * We can't xmit new skbs from this context, as we might already
 * hold qdisc lock.
 */
void tcp_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned long flags, nval, oval;

	/* Keep one reference on sk_wmem_alloc.
	 * Will be released by sk_free() from here or tcp_tasklet_func()
	 */
	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));

	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
	 * Wait until our queues (qdisc + devices) are drained.
	 * This gives :
	 * - less callbacks to tcp_write_xmit(), reducing stress (batches)
	 * - chance for incoming ACK (processed by another cpu maybe)
	 *   to migrate this flow (skb->ooo_okay will be eventually set)
	 */
	if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
		goto out;

	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
		struct tsq_tasklet *tsq;
		bool empty;

		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
			goto out;

		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
		if (nval != oval)
			continue;

		/* queue this socket to tasklet queue */
		local_irq_save(flags);
		tsq = this_cpu_ptr(&tsq_tasklet);
		empty = list_empty(&tsq->head);
		list_add(&tp->tsq_node, &tsq->head);
		if (empty)
			tasklet_schedule(&tsq->tasklet);
		local_irq_restore(flags);
		return;
	}
out:
	sk_free(sk);
}

/* Note: Called under hard irq.
 * We can not call TCP stack right away.
 */
enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
{
	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
	struct sock *sk = (struct sock *)tp;
	unsigned long nval, oval;

	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
		struct tsq_tasklet *tsq;
		bool empty;

		if (oval & TSQF_QUEUED)
			break;

		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
		if (nval != oval)
			continue;

		if (!refcount_inc_not_zero(&sk->sk_wmem_alloc))
			break;
		/* queue this socket to tasklet queue */
		tsq = this_cpu_ptr(&tsq_tasklet);
		empty = list_empty(&tsq->head);
		list_add(&tp->tsq_node, &tsq->head);
		if (empty)
			tasklet_schedule(&tsq->tasklet);
		break;
	}
	return HRTIMER_NORESTART;
}

/* BBR congestion control needs pacing.
 * Same remark for SO_MAX_PACING_RATE.
 * sch_fq packet scheduler is efficiently handling pacing,
 * but is not always installed/used.
 * Return true if TCP stack should pace packets itself.
 */
static bool tcp_needs_internal_pacing(const struct sock *sk)
{
	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
}

static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
{
	u64 len_ns;
	u32 rate;

	if (!tcp_needs_internal_pacing(sk))
		return;
	rate = sk->sk_pacing_rate;
	if (!rate || rate == ~0U)
		return;

	/* Should account for header sizes as sch_fq does,
	 * but let's make things simple.
	 */
	len_ns = (u64)skb->len * NSEC_PER_SEC;
	do_div(len_ns, rate);
	hrtimer_start(&tcp_sk(sk)->pacing_timer,
		      ktime_add_ns(ktime_get(), len_ns),
		      HRTIMER_MODE_ABS_PINNED);
}
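
/* Worked example (illustrative, not part of the original source):
 * at sk_pacing_rate = 125000000 bytes/sec (~1 Gbit/s) a 64 KB TSO skb
 * gives len_ns = 65536 * NSEC_PER_SEC / 125000000 = 524288 ns, so the
 * pacing timer defers the next transmit by roughly half a millisecond.
 */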

/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	struct tcp_out_options opts;
	unsigned int tcp_options_size, tcp_header_size;
	struct sk_buff *oskb = NULL;
	struct tcp_md5sig_key *md5;
	struct tcphdr *th;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));
	tp = tcp_sk(sk);

	if (clone_it) {
		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
			- tp->snd_una;
		oskb = skb;
		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}
	skb->skb_mstamp = tp->tcp_mstamp;

	inet = inet_sk(sk);
	tcb = TCP_SKB_CB(skb);
	memset(&opts, 0, sizeof(opts));

	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
	else
		tcp_options_size = tcp_established_options(sk, skb, &opts,
							   &md5);
	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

	/* if no packet is in qdisc/device queue, then allow XPS to select
	 * another queue. We can be called from tcp_tsq_handler()
	 * which holds one reference to sk_wmem_alloc.
	 *
	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
	 * One way to get this would be to set skb->truesize = 2 on them.
	 */
	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);

	/* If we had to use memory reserve to allocate this skb,
	 * this might cause drops if packet is looped back :
	 * Other socket might not have SOCK_MEMALLOC.
	 * Packets not looped back do not care about pfmemalloc.
	 */
	skb->pfmemalloc = 0;

	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
	skb_set_hash_from_sk(skb, sk);
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);

	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);

	/* Build TCP header and checksum it. */
	th = (struct tcphdr *)skb->data;
	th->source		= inet->inet_sport;
	th->dest		= inet->inet_dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(rcv_nxt);
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
					tcb->tcp_flags);

	th->check		= 0;
	th->urg_ptr		= 0;

	/* The urg_mode check is necessary during a below snd_una win probe */
	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
		if (before(tp->snd_up, tcb->seq + 0x10000)) {
			th->urg_ptr = htons(tp->snd_up - tcb->seq);
			th->urg = 1;
		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
			th->urg_ptr = htons(0xFFFF);
			th->urg = 1;
		}
	}