patch-2.3.41 linux/net/ipv4/tcp_output.c
- Lines: 929
- Date: Sat Jan 22 11:54:57 2000
- Orig file: v2.3.40/linux/net/ipv4/tcp_output.c
- Orig date: Fri Jan 21 18:19:17 2000
diff -u --recursive --new-file v2.3.40/linux/net/ipv4/tcp_output.c linux/net/ipv4/tcp_output.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_output.c,v 1.116 2000/01/13 00:19:49 davem Exp $
+ * Version: $Id: tcp_output.c,v 1.119 2000/01/19 04:06:15 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -31,6 +31,7 @@
* during syn/ack processing.
* David S. Miller : Output engine completely rewritten.
* Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
+ * Cacophonix Gaul : draft-minshall-nagle-01
*
*/
@@ -38,75 +39,65 @@
#include <linux/smp_lock.h>
-extern int sysctl_tcp_timestamps;
-extern int sysctl_tcp_window_scaling;
-extern int sysctl_tcp_sack;
-
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
-/* Get rid of any delayed acks, we sent one already.. */
-static __inline__ void clear_delayed_acks(struct sock * sk)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-
- tp->delayed_acks = 0;
- if(tcp_in_quickack_mode(tp))
- tcp_exit_quickack_mode(tp);
- tcp_clear_xmit_timer(sk, TIME_DACK);
-}
-
static __inline__ void update_send_head(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-
+
tp->send_head = tp->send_head->next;
if (tp->send_head == (struct sk_buff *) &sk->write_queue)
tp->send_head = NULL;
}
/* Calculate mss to advertise in SYN segment.
- RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
-
- 1. It is independent of path mtu.
- 2. Ideally, it is maximal possible segment size i.e. 65535-40.
- 3. For IPv4 it is reasonable to calculate it from maximal MTU of
- attached devices, because some buggy hosts are confused by
- large MSS.
- 4. We do not make 3, we advertise MSS, calculated from first
- hop device mtu, but allow to raise it to ip_rt_min_advmss.
- This may be overriden via information stored in routing table.
- 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
- probably even Jumbo".
+ * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
+ *
+ * 1. It is independent of path mtu.
+ * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
+ * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
+ * attached devices, because some buggy hosts are confused by
+ * large MSS.
+ * 4. We do not make 3, we advertise MSS, calculated from first
+ * hop device mtu, but allow to raise it to ip_rt_min_advmss.
+ * This may be overridden via information stored in the routing table.
+ * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
+ * probably even Jumbo".
*/
static __u16 tcp_advertise_mss(struct sock *sk)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct dst_entry *dst = __sk_dst_get(sk);
- int mss;
+ int mss = tp->advmss;
- if (dst) {
+ if (dst && dst->advmss < mss) {
mss = dst->advmss;
- } else {
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ tp->advmss = mss;
+ }
- /* No dst. It is bad. Guess some reasonable value.
- * Actually, this case should not be possible.
- * SANITY.
- */
- BUG_TRAP(dst!=NULL);
+ return (__u16)mss;
+}
- mss = tp->mss_cache;
- mss += (tp->tcp_header_len - sizeof(struct tcphdr)) +
- tp->ext_header_len;
+static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ /* If we sent a reply within ato of the last received
+ * packet, enter pingpong mode.
+ */
+ if ((u32)(tp->lsndtime - tp->ack.lrcvtime) < tp->ack.ato)
+ tp->ack.pingpong = 1;
- /* Minimal MSS to include full set of of TCP/IP options
- plus 8 bytes of data. It corresponds to mtu 128.
- */
- if (mss < 88)
- mss = 88;
- }
+ tp->lsndtime = tcp_time_stamp;
+}
- return (__u16)mss;
+static __inline__ void tcp_event_ack_sent(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ tp->last_ack_sent = tp->rcv_nxt;
+ tcp_dec_quickack_mode(tp);
+ tp->ack.pending = 0;
+ tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
}
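
The two helpers above replace the old clear_delayed_acks(): tcp_event_data_sent() flips the connection into "pingpong" (interactive) mode when our previous send happened within one ATO of the last receive, and tcp_event_ack_sent() retires the pending delayed ACK. Below is a minimal stand-alone sketch of the pingpong heuristic only, with illustrative names and plain unsigned tick arithmetic standing in for the kernel's jiffies fields:

    #include <stdio.h>

    /* Sketch only: timestamps are tick counters; unsigned subtraction
     * keeps the comparison correct across wrap-around, as in the patch. */
    struct ack_state {
            unsigned int lrcvtime;  /* when the last segment was received */
            unsigned int ato;       /* current delayed-ACK timeout estimate */
            int pingpong;           /* 1 = interactive, keep delaying ACKs */
    };

    static void event_data_sent(struct ack_state *a, unsigned int *lsndtime,
                                unsigned int now)
    {
            /* Our previous send followed the last receive within one ATO:
             * the traffic looks like request/response, so delay ACKs. */
            if ((unsigned int)(*lsndtime - a->lrcvtime) < a->ato)
                    a->pingpong = 1;
            *lsndtime = now;
    }

    int main(void)
    {
            struct ack_state a = { 1000, 40, 0 };
            unsigned int lsndtime = 1020;   /* replied 20 ticks after receiving */

            event_data_sent(&a, &lsndtime, 1100);
            printf("pingpong=%d\n", a.pingpong);    /* prints pingpong=1 */
            return 0;
    }
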
/* This routine actually transmits TCP packets queued in by
@@ -120,7 +111,7 @@
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
-void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
if(skb != NULL) {
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -128,6 +119,7 @@
int tcp_header_size = tp->tcp_header_len;
struct tcphdr *th;
int sysctl_flags;
+ int err;
#define SYSCTL_FLAG_TSTAMPS 0x1
#define SYSCTL_FLAG_WSCALE 0x2
@@ -190,11 +182,29 @@
}
tp->af_specific->send_check(sk, th, skb->len, skb);
- clear_delayed_acks(sk);
- tp->last_ack_sent = tp->rcv_nxt;
+ if (th->ack)
+ tcp_event_ack_sent(sk);
+
+ if (skb->len != tcp_header_size)
+ tcp_event_data_sent(tp, skb);
+
TCP_INC_STATS(TcpOutSegs);
- tp->af_specific->queue_xmit(skb);
+
+ err = tp->af_specific->queue_xmit(skb);
+ if (err <= 0)
+ return err;
+
+ tcp_enter_cong_avoid(tp);
+
+ /* NET_XMIT_CN is special. It does not guarantee
+ * that this packet is lost. It tells us that the device
+ * is about to start dropping packets, or is already dropping
+ * some packets of the same priority, and asks us
+ * to send less aggressively.
+ */
+ return err == NET_XMIT_CN ? 0 : err;
}
+ return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
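
tcp_transmit_skb() now reports the queue_xmit() verdict to its callers: negative values are hard errors, and NET_XMIT_CN is folded into success because it only signals local congestion, not loss. A small stand-alone sketch of that mapping follows; the XMIT_* values are illustrative constants, not taken from the kernel headers:

    /* Illustrative stand-ins for the kernel's NET_XMIT_* verdicts. */
    #define XMIT_SUCCESS    0       /* queued on the device */
    #define XMIT_DROP       1       /* packet was dropped locally */
    #define XMIT_CN         2       /* congestion notification, packet may still go out */

    /* Mirror of the return convention above: <= 0 is passed through,
     * a congestion notification is reported as success so the caller
     * does not treat the segment as lost, and a local drop is passed
     * through as a positive code the caller can treat as a soft failure. */
    int transmit_result(int queue_xmit_verdict)
    {
            if (queue_xmit_verdict <= 0)
                    return queue_xmit_verdict;
            return queue_xmit_verdict == XMIT_CN ? 0 : queue_xmit_verdict;
    }
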
@@ -202,32 +212,33 @@
/* This is the main buffer sending routine. We queue the buffer
* and decide whether to queue or transmit now.
+ *
+ * NOTE: the probe0 timer is not checked here; do not forget tcp_push_pending_frames(),
+ * otherwise the socket can stall.
*/
-void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
+void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
/* Advance write_seq and place onto the write_queue. */
- tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
+ tp->write_seq = TCP_SKB_CB(skb)->end_seq;
__skb_queue_tail(&sk->write_queue, skb);
- if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
+ if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, 1)) {
/* Send it out now. */
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tp->packets_out++;
- tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
- if(!tcp_timer_is_set(sk, TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
- } else {
- /* Queue it, remembering where we must start sending. */
- if (tp->send_head == NULL)
- tp->send_head = skb;
- if (!force_queue && tp->packets_out == 0 && !tp->pending) {
- tp->pending = TIME_PROBE0;
- tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
+ if (tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)) == 0) {
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_minshall_update(tp, cur_mss, skb->len);
+ tp->packets_out++;
+ if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ return;
}
}
+ /* Queue it, remembering where we must start sending. */
+ if (tp->send_head == NULL)
+ tp->send_head = skb;
}
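
tcp_send_skb() now hands the send/queue decision to tcp_snd_test() with the current MSS and records small segments via tcp_minshall_update(), implementing draft-minshall-nagle-01 (credited above): a new sub-MSS segment should not go out while a previously sent sub-MSS segment is still unacknowledged. A stand-alone sketch of just that check, with illustrative names and wrap-safe 32-bit sequence comparison; the real tcp_snd_test() also folds in the Nagle flag and the congestion and receive windows:

    /* Wrap-safe "a is before b" for 32-bit sequence numbers. */
    static int seq_before(unsigned int a, unsigned int b)
    {
            return (int)(a - b) < 0;
    }

    /* snd_sml: end sequence of the last sub-MSS segment we sent.
     * snd_una: oldest unacknowledged sequence number.
     * Returns nonzero when the Minshall rule permits sending a
     * segment of 'len' bytes with the current 'mss'. */
    int minshall_ok_to_send(unsigned int snd_sml, unsigned int snd_una,
                            unsigned int len, unsigned int mss)
    {
            if (len >= mss)
                    return 1;       /* full-sized segments are never held back */
            /* small segment: require every earlier small segment to be acked */
            return !seq_before(snd_una, snd_sml);
    }
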
/* Function to create two new TCP segments. Shrinks the given segment
@@ -243,13 +254,13 @@
/* Get a new skb... force flag on. */
buff = sock_wmalloc(sk,
- (nsize + MAX_HEADER + sk->prot->max_header),
+ (nsize + MAX_TCP_HEADER + 15),
1, GFP_ATOMIC);
if (buff == NULL)
- return -1; /* We'll just try again later. */
+ return -ENOMEM; /* We'll just try again later. */
/* Reserve space for headers. */
- skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(buff, MAX_TCP_HEADER);
/* Correct the sequence numbers. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
@@ -276,8 +287,8 @@
TCP_SKB_CB(buff)->sacked = 0;
/* Copy and checksum data tail into the new buffer. */
- buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
- nsize, 0);
+ buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
+ nsize, 0);
/* This takes care of the FIN sequence number too. */
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
@@ -288,6 +299,11 @@
/* Looks stupid, but our code really uses when of
* skbs, which it never sent before. --ANK
+ *
+ * NOTE: several days after I added this, Dave repaired
+ * tcp_simple_retransmit() and it should no longer use ->when
+ * of never-sent skbs. I am not sure, so this line remains
+ * until a more careful investigation. --ANK
*/
TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
@@ -335,20 +351,19 @@
if (mss_now > tp->mss_clamp)
mss_now = tp->mss_clamp;
- /* Now subtract TCP options size, not including SACKs */
- mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
-
/* Now subtract optional transport overhead */
mss_now -= tp->ext_header_len;
- /* It we got too small (or even negative) value,
- clamp it by 8 from below. Why 8 ?
- Well, it could be 1 with the same success,
- but if IP accepted segment of length 1,
- it would love 8 even more 8) --ANK (980731)
- */
- if (mss_now < 8)
- mss_now = 8;
+ /* Then reserve room for full set of TCP options and 8 bytes of data */
+ if (mss_now < 48)
+ mss_now = 48;
+
+ /* Now subtract TCP options size, not including SACKs */
+ mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+ /* Bound mss with half of window */
+ if (tp->max_window && mss_now > (tp->max_window>>1))
+ mss_now = max((tp->max_window>>1), 1);
/* And store cached results */
tp->pmtu_cookie = pmtu;
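
The reordered clamping above first guarantees room for a full option set (the 48-byte floor: 40 bytes of TCP options plus 8 bytes of data), then removes the per-segment option overhead, and finally bounds the result by half of the largest window the peer has ever advertised. A stand-alone sketch of the arithmetic with illustrative parameters; for an IPv4 path MTU of 1500 with aligned timestamps it yields the familiar 1448:

    /* All values in bytes. ip_tcp_hdr is the fixed IP+TCP header size
     * (40 for IPv4), opt_len the per-segment TCP option overhead
     * (12 for aligned timestamps), ext_hdr any extra transport overhead. */
    unsigned int sync_mss_sketch(unsigned int pmtu, unsigned int ip_tcp_hdr,
                                 unsigned int mss_clamp, unsigned int ext_hdr,
                                 unsigned int opt_len, unsigned int max_window)
    {
            unsigned int mss = pmtu - ip_tcp_hdr;

            if (mss > mss_clamp)
                    mss = mss_clamp;
            mss -= ext_hdr;
            if (mss < 48)                   /* room for full options + 8 data bytes */
                    mss = 48;
            mss -= opt_len;                 /* options sent on every segment, SACK excluded */
            if (max_window && mss > (max_window >> 1)) {
                    mss = max_window >> 1;  /* never use more than half the peer's window */
                    if (mss < 1)
                            mss = 1;
            }
            return mss;
    }
    /* sync_mss_sketch(1500, 40, 65535, 0, 12, 0) == 1448 */
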
@@ -360,27 +375,30 @@
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
+ *
+ * Returns 1 if no segments are in flight, we have queued segments, but we
+ * cannot send anything now because of SWS or another problem.
*/
-void tcp_write_xmit(struct sock *sk)
+int tcp_write_xmit(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
unsigned int mss_now;
- /* Account for SACKS, we may need to fragment due to this.
- * It is just like the real MSS changing on us midstream.
- * We also handle things correctly when the user adds some
- * IP options mid-stream. Silly to do, but cover it.
- */
- mss_now = tcp_current_mss(sk);
-
- /* If we are zapped, the bytes will have to remain here.
- * In time closedown will empty the write queue and all
+ /* If we are closed, the bytes will have to remain here.
+ * In time closedown will finish, we empty the write queue and all
* will be happy.
*/
- if(!sk->zapped) {
+ if(sk->state != TCP_CLOSE) {
struct sk_buff *skb;
int sent_pkts = 0;
+ /* Account for SACKS, we may need to fragment due to this.
+ * It is just like the real MSS changing on us midstream.
+ * We also handle things correctly when the user adds some
+ * IP options mid-stream. Silly to do, but cover it.
+ */
+ mss_now = tcp_current_mss(sk);
+
/* Anything on the transmit queue that fits the window can
* be added providing we are:
*
@@ -388,27 +406,36 @@
* b) not exceeding our congestion window.
* c) not retransmitting [Nagle]
*/
- while((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
+ while((skb = tp->send_head) &&
+ tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb))) {
if (skb->len > mss_now) {
if (tcp_fragment(sk, skb, mss_now))
break;
}
- /* Advance the send_head. This one is going out. */
- update_send_head(sk);
TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
+ break;
+ /* Advance the send_head. This one is sent out. */
+ update_send_head(sk);
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tcp_minshall_update(tp, mss_now, skb->len);
tp->packets_out++;
- tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
sent_pkts = 1;
}
/* If we sent anything, make sure the retransmit
* timer is active.
*/
- if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ if (sent_pkts) {
+ if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ return 0;
+ }
+
+ return !tp->packets_out && tp->send_head;
}
+ return 0;
}
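
The new return value pairs with the NOTE added to tcp_send_skb() above: when nothing is in flight and the queued data still cannot be sent, only the probe0 timer can restart the flow, so the push path has to arm it. A hedged sketch of that caller side; the prototypes are included only so the fragment stands alone, and tcp_check_probe_timer appears later in this patch:

    struct sock;
    struct tcp_opt;

    int tcp_write_xmit(struct sock *sk);
    void tcp_check_probe_timer(struct sock *sk, struct tcp_opt *tp);

    /* Sketch: if tcp_write_xmit() reports a stalled queue with nothing
     * in flight (return value 1), arm the zero-window probe timer so the
     * connection cannot hang forever waiting for an ACK that will not come. */
    void push_pending_frames_sketch(struct sock *sk, struct tcp_opt *tp)
    {
            if (tcp_write_xmit(sk))
                    tcp_check_probe_timer(sk, tp);
    }
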
/* This function returns the amount that we can raise the
@@ -471,7 +498,7 @@
* but may be worse for the performance because of rcv_mss
* fluctuations. --SAW 1998/11/1
*/
- unsigned int mss = tp->rcv_mss;
+ unsigned int mss = tp->ack.rcv_mss;
int free_space;
u32 window;
@@ -481,11 +508,19 @@
free_space = tp->window_clamp;
if (tp->window_clamp < mss)
mss = tp->window_clamp;
-
- if ((free_space < (tcp_full_space(sk) / 2)) &&
+
+ if ((free_space < (min((int)tp->window_clamp, tcp_full_space(sk)) / 2)) &&
(free_space < ((int) (mss/2)))) {
window = 0;
- tp->pred_flags = 0;
+
+ /* THIS IS a _VERY_ GOOD PLACE to play with the window clamp:
+ * if free_space becomes suspiciously low,
+ * verify the ratio rmem_alloc/(rcv_nxt - copied_seq),
+ * and if we predict that when free_space falls below mss,
+ * rmem_alloc will exceed rcvbuf*2, shrink window_clamp.
+ * It will eliminate most of the prune events! Very simple,
+ * it is the next thing to do. --ANK
+ */
} else {
/* Get the largest window that is a nice multiple of mss.
* Window clamp already applied above.
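
The modified test above is receiver-side SWS avoidance: the advertised window is shut completely once free receive space falls below both half of the usable receive buffer (now additionally capped by window_clamp) and half of the peer's observed MSS. A stand-alone sketch of the decision with illustrative names, ignoring window scaling and assuming a positive mss and clamp:

    /* free_space: bytes we could still buffer; full_space: total usable
     * receive buffer; window_clamp: configured ceiling; mss: peer's
     * estimated sending MSS. Returns the window to advertise. */
    unsigned int select_window_sketch(int free_space, int full_space,
                                      int window_clamp, int mss)
    {
            int usable = full_space < window_clamp ? full_space : window_clamp;

            if (free_space > window_clamp)
                    free_space = window_clamp;
            if (window_clamp < mss)
                    mss = window_clamp;

            if (free_space < usable / 2 && free_space < mss / 2)
                    return 0;               /* shut the window: SWS avoidance */

            /* otherwise advertise the largest multiple of mss that fits */
            return (unsigned int)((free_space / mss) * mss);
    }
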
@@ -542,9 +577,9 @@
/* Optimize, actually we could also combine next_skb->csum
* to skb->csum using a single add w/carry operation too.
*/
- skb->csum = csum_partial_copy(next_skb->data,
- skb_put(skb, next_skb_size),
- next_skb_size, skb->csum);
+ skb->csum = csum_partial_copy_nocheck(next_skb->data,
+ skb_put(skb, next_skb_size),
+ next_skb_size, skb->csum);
}
/* Update sequence range on original skb. */
@@ -603,8 +638,10 @@
if (old_next_skb != skb || skb->len > mss)
resend_skb = 1;
old_next_skb = skb->next;
- if (resend_skb != 0)
- tcp_retransmit_skb(sk, skb);
+ if (resend_skb != 0) {
+ if (tcp_retransmit_skb(sk, skb))
+ break;
+ }
}
}
@@ -629,9 +666,21 @@
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
unsigned int cur_mss = tcp_current_mss(sk);
+#ifdef TCP_DEBUG
+ /* It was possible this summer that the retransmit timer
+ * raced with its deletion and hit a socket with packets_out==0.
+ * I fixed it, but preserved the check in the place
+ * where the fault occurred. --ANK
+ */
+ if (skb == NULL) {
+ printk("tcp_retransmit_skb: bug, skb==NULL, caller=%p\n", NET_CALLER(sk));
+ return -EFAULT;
+ }
+#endif
+
if(skb->len > cur_mss) {
if(tcp_fragment(sk, skb, cur_mss))
- return 1; /* We'll try again later. */
+ return -ENOMEM; /* We'll try again later. */
/* New SKB created, account for it. */
tp->packets_out++;
@@ -646,7 +695,7 @@
tcp_retrans_try_collapse(sk, skb, cur_mss);
if(tp->af_specific->rebuild_header(sk))
- return 1; /* Routing failure or similar. */
+ return -EHOSTUNREACH; /* Routing failure or similar. */
/* Some Solaris stacks overoptimize and ignore the FIN on a
* retransmit when old data is attached. So strip it off
@@ -673,13 +722,10 @@
else
skb = skb_clone(skb, GFP_ATOMIC);
- tcp_transmit_skb(sk, skb);
-
/* Update global TCP statistics and return success. */
- sk->prot->retransmits++;
TCP_INC_STATS(TcpRetransSegs);
- return 0;
+ return tcp_transmit_skb(sk, skb);
}
/* This gets called after a retransmit timeout, and the initially
@@ -774,7 +820,11 @@
*/
mss_now = tcp_current_mss(sk);
- if((tp->send_head != NULL) && (skb->len < mss_now)) {
+ /* Please find seven differences from 2.3.33 and look at
+ * what I broke here. 8) --ANK
+ */
+
+ if(tp->send_head != NULL) {
/* tcp_write_xmit() takes care of the rest. */
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
TCP_SKB_CB(skb)->end_seq++;
@@ -783,31 +833,34 @@
/* Special case to avoid Nagle bogosity. If this
* segment is the last segment, and it was queued
* due to Nagle/SWS-avoidance, send it out now.
+ *
+ * Hmm... actually it also overrides congestion
+ * avoidance (OK for FIN) and the retransmit phase
+ * (not OK? Added.).
*/
if(tp->send_head == skb &&
- !sk->nonagle &&
- skb->len < (tp->rcv_mss >> 1) &&
- tp->packets_out &&
- !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
- update_send_head(sk);
+ !after(tp->write_seq, tp->snd_una + tp->snd_wnd) &&
+ !tp->retransmits) {
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tp->packets_out++;
- tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
- if(!tcp_timer_is_set(sk, TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) {
+ update_send_head(sk);
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tp->packets_out++;
+ if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ } else
+ tcp_check_probe_timer(sk, tp);
}
} else {
/* Socket is locked, keep trying until memory is available. */
do {
skb = sock_wmalloc(sk,
- (MAX_HEADER +
- sk->prot->max_header),
+ MAX_TCP_HEADER + 15,
1, GFP_KERNEL);
} while (skb == NULL);
/* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(skb, MAX_TCP_HEADER);
skb->csum = 0;
TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
TCP_SKB_CB(skb)->sacked = 0;
@@ -816,7 +869,8 @@
/* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
TCP_SKB_CB(skb)->seq = tp->write_seq;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
- tcp_send_skb(sk, skb, 0);
+ tcp_send_skb(sk, skb, 0, mss_now);
+ __tcp_push_pending_frames(sk, tp, mss_now);
}
}
@@ -831,19 +885,19 @@
struct sk_buff *skb;
/* NOTE: No TCP options attached and we never retransmit this. */
- skb = alloc_skb(MAX_HEADER + sk->prot->max_header, priority);
+ skb = alloc_skb(MAX_TCP_HEADER + 15, priority);
if (!skb)
return;
/* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(skb, MAX_TCP_HEADER);
skb->csum = 0;
TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
TCP_SKB_CB(skb)->sacked = 0;
TCP_SKB_CB(skb)->urg_ptr = 0;
/* Send it off. */
- TCP_SKB_CB(skb)->seq = tp->write_seq;
+ TCP_SKB_CB(skb)->seq = tp->snd_nxt;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
tcp_transmit_skb(sk, skb);
@@ -859,13 +913,13 @@
struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff* skb;
- skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
+ skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15,
1, GFP_ATOMIC);
if (skb == NULL)
return -ENOMEM;
/* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(skb, MAX_TCP_HEADER);
skb->csum = 0;
TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN);
TCP_SKB_CB(skb)->sacked = 0;
@@ -877,8 +931,7 @@
__skb_queue_tail(&sk->write_queue, skb);
TCP_SKB_CB(skb)->when = tcp_time_stamp;
tp->packets_out++;
- tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
- return 0;
+ return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
}
/*
@@ -887,16 +940,17 @@
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
struct open_request *req)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct tcphdr *th;
int tcp_header_size;
struct sk_buff *skb;
- skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC);
+ skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
if (skb == NULL)
return NULL;
/* Reserve space for headers. */
- skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(skb, MAX_TCP_HEADER);
skb->dst = dst_clone(dst);
@@ -919,7 +973,7 @@
if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
__u8 rcv_wscale;
/* Set this up on the first call only */
- req->window_clamp = skb->dst->window;
+ req->window_clamp = tp->window_clamp ? : skb->dst->window;
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(tcp_full_space(sk),
dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -951,7 +1005,7 @@
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
/* Reserve space for headers. */
- skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(buff, MAX_TCP_HEADER + 15);
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -962,12 +1016,16 @@
/* If user gave his TCP_MAXSEG, record it to clamp */
if (tp->user_mss)
tp->mss_clamp = tp->user_mss;
+ tp->max_window = 0;
tcp_sync_mss(sk, dst->pmtu);
+ tcp_initialize_rcv_mss(sk);
- tp->window_clamp = dst->window;
+ if (!tp->window_clamp)
+ tp->window_clamp = dst->window;
+ tp->advmss = dst->advmss;
tcp_select_initial_window(tcp_full_space(sk),
- dst->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)),
+ tp->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)),
&tp->rcv_wnd,
&tp->window_clamp,
sysctl_tcp_window_scaling,
@@ -982,10 +1040,12 @@
goto err_out;
sk->err = 0;
+ sk->done = 0;
tp->snd_wnd = 0;
tp->snd_wl1 = 0;
tp->snd_wl2 = tp->write_seq;
tp->snd_una = tp->write_seq;
+ tp->snd_sml = tp->write_seq;
tp->rcv_nxt = 0;
tp->rcv_wup = 0;
tp->copied_seq = 0;
@@ -1006,13 +1066,14 @@
/* Send it off. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ tp->syn_stamp = TCP_SKB_CB(buff)->when;
__skb_queue_tail(&sk->write_queue, buff);
tp->packets_out++;
tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
TCP_INC_STATS(TcpActiveOpens);
/* Timer for repeating the SYN until an answer. */
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
return 0;
err_out:
@@ -1025,16 +1086,14 @@
* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
* for details.
*/
-void tcp_send_delayed_ack(struct sock *sk, int max_timeout)
+void tcp_send_delayed_ack(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
unsigned long timeout;
/* Stay within the limit we were given */
- timeout = (tp->ato << 1) >> 1;
- if (timeout > max_timeout)
- timeout = max_timeout;
- timeout += jiffies;
+ timeout = tp->ack.ato;
+ timeout += jiffies + (timeout>>2);
/* Use new timeout only if there wasn't a older one earlier. */
spin_lock_bh(&sk->timer_lock);
@@ -1042,18 +1101,46 @@
sock_hold(sk);
tp->delack_timer.expires = timeout;
} else {
+ /* If delack timer was blocked or is about to expire,
+ * send ACK now.
+ */
+ if (tp->ack.blocked || time_before_eq(tp->delack_timer.expires, jiffies+(tp->ack.ato>>2))) {
+ spin_unlock_bh(&sk->timer_lock);
+
+ tcp_send_ack(sk);
+ __sock_put(sk);
+ return;
+ }
+
if (time_before(timeout, tp->delack_timer.expires))
tp->delack_timer.expires = timeout;
}
add_timer(&tp->delack_timer);
spin_unlock_bh(&sk->timer_lock);
+
+#ifdef TCP_FORMAL_WINDOW
+ /* Explanation: the header prediction path does not handle
+ * the case of a zero window. If we send the ACK immediately, pred_flags
+ * are reset when the ACK is sent. If rcv_nxt is advanced and the
+ * ack is not sent, then a delayed ack is scheduled.
+ * Hence, this is the best place to check for a zero window.
+ */
+ if (tp->pred_flags) {
+ if (tcp_receive_window(tp) == 0)
+ tp->pred_flags = 0;
+ } else {
+ if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
+ !tp->urg_data)
+ tcp_fast_path_on(tp);
+ }
+#endif
}
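
tcp_send_delayed_ack() now derives the expiry from the ATO alone: the timer is armed ato plus a quarter of ato into the future, and if an already-pending timer was blocked by the socket lock or would fire within ato/4 anyway, the ACK is sent immediately instead of being re-armed. A stand-alone sketch of that timing decision with illustrative names, times in ticks, ignoring timer-wrap details:

    /* Returns 1 if the ACK should be sent right now, 0 if the delayed-ACK
     * timer should be (re)armed for *expires ticks. */
    int delack_decision_sketch(unsigned long now, unsigned long ato,
                               int timer_pending, int timer_blocked,
                               unsigned long pending_expires,
                               unsigned long *expires)
    {
            unsigned long timeout = now + ato + (ato >> 2); /* ato + 25% slack */

            if (timer_pending) {
                    /* Blocked handler, or the existing timer fires soon anyway:
                     * just send the ACK now instead of juggling timers. */
                    if (timer_blocked || pending_expires <= now + (ato >> 2))
                            return 1;
                    if (pending_expires < timeout)
                            timeout = pending_expires;      /* keep the earlier deadline */
            }
            *expires = timeout;
            return 0;
    }
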
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
/* If we have been reset, we may not send again. */
- if(!sk->zapped) {
+ if(sk->state != TCP_CLOSE) {
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *buff;
@@ -1061,29 +1148,15 @@
* tcp_transmit_skb() will set the ownership to this
* sock.
*/
- buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC);
+ buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
if (buff == NULL) {
- /* Force it to send an ack. We don't have to do this
- * (ACK is unreliable) but it's much better use of
- * bandwidth on slow links to send a spare ack than
- * resend packets.
- *
- * This is the one possible way that we can delay an
- * ACK and have tp->ato indicate that we are in
- * quick ack mode, so clear it. It is also the only
- * possible way for ato to be zero, when ACK'ing a
- * SYNACK because we've taken no ATO measurement yet.
- */
- if (tcp_in_quickack_mode(tp))
- tcp_exit_quickack_mode(tp);
- if (!tp->ato)
- tp->ato = tp->rto;
- tcp_send_delayed_ack(sk, HZ/2);
+ tp->ack.pending = 1;
+ tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
return;
}
/* Reserve space for headers and prepare control bits. */
- skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(buff, MAX_TCP_HEADER);
buff->csum = 0;
TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
TCP_SKB_CB(buff)->sacked = 0;
@@ -1099,24 +1172,20 @@
/* This routine sends a packet with an out of date sequence
* number. It assumes the other end will try to ack it.
*/
-void tcp_write_wakeup(struct sock *sk)
+int tcp_write_wakeup(struct sock *sk)
{
- /* After a valid reset we can send no more. */
- if (!sk->zapped) {
+ if (sk->state != TCP_CLOSE) {
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb;
- /* Write data can still be transmitted/retransmitted in the
- * following states. If any other state is encountered, return.
- * [listen/close will never occur here anyway]
+ /* Now this function is never called while
+ * we have something not yet ACKed in the queue.
*/
- if ((1 << sk->state) &
- ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
- TCPF_FIN_WAIT2|TCPF_LAST_ACK|TCPF_CLOSING))
- return;
+ BUG_TRAP(tp->snd_una == tp->snd_nxt);
- if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
- ((skb = tp->send_head) != NULL)) {
+ if (tp->snd_wnd > (tp->snd_nxt-tp->snd_una)
+ && ((skb = tp->send_head) != NULL)) {
+ int err;
unsigned long win_size;
/* We are probing the opening of a window
@@ -1126,24 +1195,26 @@
win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
if (tcp_fragment(sk, skb, win_size))
- return; /* Let a retransmit get it. */
+ return -1;
}
- update_send_head(sk);
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tp->packets_out++;
- tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
- if (!tcp_timer_is_set(sk, TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+ if (!err) {
+ update_send_head(sk);
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tp->packets_out++;
+ if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ }
+ return err;
} else {
/* We don't queue it, tcp_transmit_skb() sets ownership. */
- skb = alloc_skb(MAX_HEADER + sk->prot->max_header,
- GFP_ATOMIC);
+ skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
if (skb == NULL)
- return;
+ return -1;
/* Reserve space for headers and set control bits. */
- skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb_reserve(skb, MAX_TCP_HEADER);
skb->csum = 0;
TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
TCP_SKB_CB(skb)->sacked = 0;
@@ -1152,13 +1223,18 @@
/* Use a previous sequence. This should cause the other
* end to send an ack. Don't queue or clone SKB, just
* send it.
+ *
+ * RED-PEN: logically it should be snd_una-1.
+ * snd_nxt-1 will not be acked. snd_una==snd_nxt
+ * in this place however. Right?
*/
- TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
+ TCP_SKB_CB(skb)->seq = tp->snd_una - 1;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_transmit_skb(sk, skb);
+ return tcp_transmit_skb(sk, skb);
}
}
+ return -1;
}
/* A window probe timeout has occurred. If window is not closed send
@@ -1167,11 +1243,32 @@
void tcp_send_probe0(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int err;
+
+ err = tcp_write_wakeup(sk);
+
+ if (tp->packets_out || !tp->send_head) {
+ /* Cancel probe timer, if it is not required. */
+ tp->probes_out = 0;
+ tp->backoff = 0;
+ return;
+ }
- tcp_write_wakeup(sk);
- tp->pending = TIME_PROBE0;
- tp->backoff++;
- tp->probes_out++;
- tcp_reset_xmit_timer (sk, TIME_PROBE0,
- min(tp->rto << tp->backoff, 120*HZ));
+ if (err <= 0) {
+ tp->backoff++;
+ tp->probes_out++;
+ tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
+ min(tp->rto << tp->backoff, TCP_RTO_MAX));
+ } else {
+ /* If the packet was not sent due to local congestion,
+ * do not back off and do not count it in probes_out.
+ * Let local senders fight for local resources.
+ *
+ * Still use the accumulated backoff, though.
+ */
+ if (!tp->probes_out)
+ tp->probes_out=1;
+ tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
+ min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+ }
}
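
tcp_send_probe0() above now distinguishes a probe that really left the host (back off exponentially, capped at TCP_RTO_MAX, and count it in probes_out) from one dropped by local congestion (retry on the short resource-probe interval without inflating the counter, so a busy local device cannot kill the connection). A stand-alone sketch of the interval selection, with illustrative constants standing in for HZ, TCP_RTO_MAX and TCP_RESOURCE_PROBE_INTERVAL:

    #define HZ_TICKS        100UL                   /* assumed tick rate */
    #define RTO_MAX         (120UL * HZ_TICKS)      /* stand-in for TCP_RTO_MAX */
    #define RESOURCE_IVAL   (HZ_TICKS / 2)          /* stand-in for TCP_RESOURCE_PROBE_INTERVAL */

    /* rto: current retransmission timeout in ticks; backoff: accumulated
     * exponential backoff; sent: nonzero if the probe actually went out.
     * Returns the delay before the next zero-window probe. */
    unsigned long probe0_interval_sketch(unsigned long rto, unsigned int backoff,
                                         int sent)
    {
            unsigned long ival = rto << backoff;    /* use accumulated backoff either way */
            unsigned long cap = sent ? RTO_MAX : RESOURCE_IVAL;

            return ival < cap ? ival : cap;
    }
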