-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathnewcwv-3.18.3.patch
370 lines (353 loc) · 11.3 KB
/
newcwv-3.18.3.patch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index c2dee7d..a06896e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -315,6 +315,19 @@ struct tcp_sock {
* socket. Used to retransmit SYNACKs etc.
*/
struct request_sock *fastopen_rsk;
+
+/* TCP newcwv related information */
+ u32 psample[4]; /* pipeACK samples circular buffer */
+ u32 time_stamp[4];
+ u32 head; /* index for psample buffer */
+
+ u32 pipeack; /* pipeACK value after filtering */
+ u32 loss_flight_size; /* flightsize at loss detection */
+ u32 prior_retrans; /* Retransmission before going into FR */
+ u32 prev_snd_una; /* snd_una of last record */
+ u32 prev_snd_nxt; /* snd_nxt of last record */
+ u32 cwnd_valid_ts; /* last time cwnd was found valid */
+
};
enum tsq_flags {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4062b4f..554909e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -228,6 +228,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TFO_SERVER_WO_SOCKOPT1 0x400
#define TFO_SERVER_WO_SOCKOPT2 0x800
+/* NewCWV defaults */
+#define NCWV_UNDEF 0xFFFFFFFF
+#define NCWV_FIVEMINS (HZ*300)
+
extern struct inet_timewait_death_row tcp_death_row;
/* sysctl variables for tcp */
@@ -276,6 +280,7 @@ extern int sysctl_tcp_challenge_ack_limit;
extern unsigned int sysctl_tcp_notsent_lowat;
extern int sysctl_tcp_min_tso_segs;
extern int sysctl_tcp_autocorking;
+extern int sysctl_tcp_newcwv;
extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
@@ -995,28 +1000,20 @@ static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
return tp->snd_una + tp->snd_wnd;
}
-/* We follow the spirit of RFC2861 to validate cwnd but implement a more
- * flexible approach. The RFC suggests cwnd should not be raised unless
- * it was fully used previously. And that's exactly what we do in
- * congestion avoidance mode. But in slow start we allow cwnd to grow
- * as long as the application has used half the cwnd.
- * Example :
- * cwnd is 10 (IW10), but application sends 9 frames.
- * We allow cwnd to reach 18 when all frames are ACKed.
- * This check is safe because it's as aggressive as slow start which already
- * risks 100% overshoot. The advantage is that we discourage application to
- * either send more filler packets or data to artificially blow up the cwnd
- * usage, and allow application-limited process to probe bw more aggressively.
- */
-static inline bool tcp_is_cwnd_limited(const struct sock *sk)
+static inline bool tcp_is_cwnd_limited(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
- /* If in slow start, ensure cwnd grows to twice what was ACKed. */
- if (tp->snd_cwnd <= tp->snd_ssthresh)
- return tp->snd_cwnd < 2 * tp->max_packets_out;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* draft-ietf-tcpm-newcwv relaxes conditions for growing cwnd */
+ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
+ (sysctl_tcp_newcwv &&
+ tp->pipeack >= (tp->snd_cwnd*tp->mss_cache >> 1))) {
+ tp->cwnd_valid_ts = tcp_time_stamp;
+ return true;
+ }
- return tp->is_cwnd_limited;
+ return false;
}
static inline void tcp_check_probe_timer(struct sock *sk)
@@ -1366,6 +1363,13 @@ struct tcp_fastopen_context {
struct rcu_head rcu;
};
+/* TCP New CWV functions */
+void tcp_newcwv_update_pipeack(struct sock* sk);
+void tcp_newcwv_datalim_closedown(struct sock* sk);
+void tcp_newcwv_enter_recovery(struct sock* sk);
+void tcp_newcwv_end_recovery(struct sock* sk);
+void tcp_newcwv_reset(struct sock* sk);
+
/* write queue abstraction */
static inline void tcp_write_queue_purge(struct sock *sk)
{
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 518c04e..77d3ed5 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,7 +10,7 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
- fib_frontend.o fib_semantics.o fib_trie.o \
+ fib_frontend.o fib_semantics.o fib_trie.o tcp_newcwv.o \
inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b3c53c8..ca686ac 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -489,6 +489,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_newcwv",
+ .data = &sysctl_tcp_newcwv,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "tcp_reordering",
.data = &sysctl_tcp_reordering,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index b1c5970..980122b 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -103,6 +103,9 @@ void tcp_init_congestion_control(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+ /* draft-ietf-tcpm-newcwv initial state */
+ tcp_newcwv_reset(sk);
+
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
}
@@ -282,6 +285,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
return err;
}
+
/* Slow start is used when congestion window is no greater than the slow start
* threshold. We base on RFC2581 and also handle stretch ACKs properly.
* We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d107ee2..47afa26 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1940,6 +1940,7 @@ void tcp_enter_loss(struct sock *sk)
tp->snd_cwnd = 1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
+ tcp_newcwv_reset(sk);
tp->retrans_out = 0;
tp->lost_out = 0;
@@ -2539,7 +2540,16 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
(tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
tp->snd_cwnd = tp->snd_ssthresh;
tp->snd_cwnd_stamp = tcp_time_stamp;
+
+ /* draft-ietf-tcpm-newcwv actions at the end of recovery */
+ if (sysctl_tcp_newcwv) {
+ if (tp->loss_flight_size)
+ tcp_newcwv_end_recovery(sk);
+ tcp_newcwv_reset(sk);
+ }
}
+ tp->loss_flight_size = 0;
+
tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
@@ -2682,6 +2692,11 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
tp->prior_ssthresh = 0;
tcp_init_undo(tp);
+ /* draft-ietf-tcpm-newcwv reduction at start of recovery */
+ if (sysctl_tcp_newcwv &&
+ tp->pipeack <= (tp->snd_cwnd*tp->mss_cache >> 1))
+ tcp_newcwv_enter_recovery(sk);
+
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
if (!ece_ack)
tp->prior_ssthresh = tcp_current_ssthresh(sk);
@@ -3501,6 +3516,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
sack_rtt_us);
acked -= tp->packets_out;
+ /* draft-ietf-tcpm-newcwv pipeack estimation */
+ if (sysctl_tcp_newcwv && icsk->icsk_ca_state <= TCP_CA_Disorder
+ && (flag & FLAG_DATA_ACKED))
+ tcp_newcwv_update_pipeack(sk);
+
/* Advance cwnd if state allows */
if (tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, acked);
diff --git a/net/ipv4/tcp_newcwv.c b/net/ipv4/tcp_newcwv.c
new file mode 100644
index 0000000..65a0cba
--- /dev/null
+++ b/net/ipv4/tcp_newcwv.c
@@ -0,0 +1,143 @@
+/*
+ * New-CWV implementation for Linux: draft-ietf-tcpm-newcwv.txt
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define nextbin(x) (((x)+1) & 0x03)
+#define prevbin(x) (((x)-1) & 0x03)
+
+int sysctl_tcp_newcwv __read_mostly = 0;
+EXPORT_SYMBOL(sysctl_tcp_newcwv);
+
+/* returns pipeack in MSSs */
+static u32 pa_pkts(struct tcp_sock* tp)
+{
+ if (tp->pipeack == NCWV_UNDEF || !tp->mss_cache)
+ return 0;
+
+ return (tp->pipeack)/tp->mss_cache;
+}
+
+/* initialies newcwv variables */
+void tcp_newcwv_reset(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->prev_snd_una = tp->snd_una;
+ tp->prev_snd_nxt = tp->snd_nxt;
+ tp->cwnd_valid_ts = tcp_time_stamp;
+ tp->loss_flight_size = 0;
+
+ tp->head = 0;
+ tp->psample[0] = NCWV_UNDEF;
+ tp->pipeack = NCWV_UNDEF;
+}
+
+/* This fuction removes all the expired elements from the circular buffer
+ * and returns the maximum from the remaining elements
+ */
+static int remove_expired_element(struct tcp_sock *tp, u32 psp)
+{
+ int k = tp->head;
+ int tmp = tp->psample[tp->head];
+
+ while (tp->psample[k] != NCWV_UNDEF) {
+ /* remove expired */
+ if (tp->time_stamp[k] < tcp_time_stamp - psp) {
+ tp->psample[k] = NCWV_UNDEF;
+ continue;
+ }
+
+ /* search the maximum */
+ if (tp->psample[k] > tmp)
+ tmp = tp->psample[k];
+
+ k = prevbin(k);
+ if (k == tp->head)
+ return tmp;
+ }
+
+ return tmp;
+}
+
+/* updates pipeack when an ACK is received */
+void tcp_newcwv_update_pipeack(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 tmp_pipeack, psp, h;
+
+ psp = max_t(u32, 3*usecs_to_jiffies(tp->srtt_us>>3), (u32) HZ);
+
+ if (tp->snd_una >= tp->prev_snd_nxt) {
+
+ /* now get a new pipeack sample */
+ tmp_pipeack = tp->snd_una - tp->prev_snd_una;
+ tp->prev_snd_una = tp->snd_una;
+ tp->prev_snd_nxt = tp->snd_nxt;
+
+ h = tp->head;
+
+ /* create a new element at the end of current pmp */
+ if (tp->psample[h] == NCWV_UNDEF ||
+ tcp_time_stamp > tp->time_stamp[h] + (psp >> 2)) {
+
+ /* Add an element to circular the buffer */
+ tp->head = nextbin(h);
+ tp->psample[tp->head] = tmp_pipeack;
+ tp->time_stamp[tp->head] = tcp_time_stamp;
+
+ } else if (tmp_pipeack > tp->psample[h])
+ tp->psample[h] = tmp_pipeack;
+ }
+
+ tp->pipeack = remove_expired_element(tp, psp);
+}
+
+/* newcwv actions at loss detection */
+void tcp_newcwv_enter_recovery(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 pipe;
+
+ if (tp->pipeack == NCWV_UNDEF)
+ return;
+
+ tp->prior_retrans = tp->total_retrans;
+ tp->loss_flight_size = tcp_packets_in_flight(tp);
+
+ pipe = max_t(u32, pa_pkts(tp), tp->loss_flight_size);
+ tp->snd_cwnd = max_t(u32, pipe >> 1, 1UL);
+}
+
+/* newcwv actions at the end of recovery */
+void tcp_newcwv_end_recovery(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 retrans, pipe;
+
+ retrans = tp->total_retrans - tp->prior_retrans;
+ pipe = max_t(u32, pa_pkts(tp), tp->loss_flight_size) - retrans;
+ tp->snd_ssthresh = max_t(u32, pipe >> 1, 1UL);
+ tp->snd_cwnd = tp->snd_ssthresh;
+}
+
+
+/* reduces the cwnd after 5mins of non-validated phase */
+void tcp_newcwv_datalim_closedown(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ while ((tcp_time_stamp - tp->cwnd_valid_ts) > NCWV_FIVEMINS &&
+ tp->snd_cwnd > TCP_INIT_CWND) {
+ tp->cwnd_valid_ts += NCWV_FIVEMINS;
+
+ tp->snd_ssthresh =
+ max_t(u32, (3 * tp->snd_cwnd) >> 2, tp->snd_ssthresh);
+ tp->snd_cwnd =
+ max_t(u32, tp->snd_cwnd >> 1, TCP_INIT_CWND);
+ }
+}
+
+
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a3d453b..9f0e532 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -168,6 +168,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
const u32 now = tcp_time_stamp;
const struct dst_entry *dst = __sk_dst_get(sk);
+ if (sysctl_tcp_newcwv)
+ tcp_newcwv_datalim_closedown(sk);
+
if (sysctl_tcp_slow_start_after_idle &&
(!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
tcp_cwnd_restart(sk, __sk_dst_get(sk));