patch-2.3.15 linux/include/net/tcp.h
- Lines: 624
- Date: Tue Aug 24 17:12:21 1999
- Orig file: v2.3.14/linux/include/net/tcp.h
- Orig date: Wed Aug 18 16:44:39 1999
diff -u --recursive --new-file v2.3.14/linux/include/net/tcp.h linux/include/net/tcp.h
@@ -18,17 +18,25 @@
#ifndef _TCP_H
#define _TCP_H
+#define TCP_DEBUG 1
+
#include <linux/config.h>
#include <linux/tcp.h>
#include <linux/slab.h>
#include <net/checksum.h>
+#include <net/sock.h>
/* This is for all connections with a full identity, no wildcards.
* New scheme, half the table is for TIME_WAIT, the other half is
* for the rest. I'll experiment with dynamic table growth later.
*/
+struct tcp_ehash_bucket {
+ rwlock_t lock;
+ struct sock *chain;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
extern int tcp_ehash_size;
-extern struct sock **tcp_ehash;
+extern struct tcp_ehash_bucket *tcp_ehash;
/* This is for listening sockets, thus all sockets which possess wildcards. */
#define TCP_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
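
For illustration only (not part of the patch): with the per-bucket rwlock
above, an established-table lookup serializes on a single chain instead of
one global lock. A hedged sketch, with the hash index and the identity
comparison left as placeholders:

	static __inline__ struct sock *example_ehash_walk(int hash)
	{
		struct tcp_ehash_bucket *head = &tcp_ehash[hash];
		struct sock *sk;

		read_lock(&head->lock);
		for (sk = head->chain; sk != NULL; sk = sk->next) {
			/* compare daddr/dport/rcv_saddr/sport here */
		}
		read_unlock(&head->lock);
		return sk;	/* NULL if no match */
	}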
@@ -38,6 +46,9 @@
* the port space is shared.
*/
extern struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
+extern rwlock_t tcp_lhash_lock;
+extern atomic_t tcp_lhash_users;
+extern wait_queue_head_t tcp_lhash_wait;
/* There are a few simple rules, which allow for local port reuse by
* an application. In essence:
@@ -78,33 +89,21 @@
struct tcp_bind_bucket **pprev;
};
-extern struct tcp_bind_bucket **tcp_bhash;
+struct tcp_bind_hashbucket {
+ spinlock_t lock;
+ struct tcp_bind_bucket *chain;
+};
+
+extern struct tcp_bind_hashbucket *tcp_bhash;
extern int tcp_bhash_size;
+extern spinlock_t tcp_portalloc_lock;
extern kmem_cache_t *tcp_bucket_cachep;
-extern struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum);
+extern struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
+ unsigned short snum);
extern void tcp_bucket_unlock(struct sock *sk);
extern int tcp_port_rover;
-extern struct sock *tcp_v4_lookup_listener(u32 addr, unsigned short hnum, int dif);
-
-/* Level-1 socket-demux cache. */
-#define TCP_NUM_REGS 32
-extern struct sock *tcp_regs[TCP_NUM_REGS];
-
-#define TCP_RHASH_FN(__fport) \
- ((((__fport) >> 7) ^ (__fport)) & (TCP_NUM_REGS - 1))
-#define TCP_RHASH(__fport) tcp_regs[TCP_RHASH_FN((__fport))]
-#define TCP_SK_RHASH_FN(__sock) TCP_RHASH_FN((__sock)->dport)
-#define TCP_SK_RHASH(__sock) tcp_regs[TCP_SK_RHASH_FN((__sock))]
-
-static __inline__ void tcp_reg_zap(struct sock *sk)
-{
- struct sock **rpp;
-
- rpp = &(TCP_SK_RHASH(sk));
- if(*rpp == sk)
- *rpp = NULL;
-}
+extern struct sock *tcp_v4_lookup_listener(u32 addr, unsigned short hnum, int dif);
/* These are AF independent. */
static __inline__ int tcp_bhashfn(__u16 lport)
@@ -121,8 +120,6 @@
* XXX Yes I know this is gross, but I'd have to edit every single
* XXX networking file if I created a "struct sock_header". -DaveM
*/
- struct sock *bind_next;
- struct sock **bind_pprev;
__u32 daddr;
__u32 rcv_saddr;
__u16 dport;
@@ -130,20 +127,30 @@
int bound_dev_if;
struct sock *next;
struct sock **pprev;
+ struct sock *bind_next;
+ struct sock **bind_pprev;
unsigned char state,
zapped;
__u16 sport;
unsigned short family;
unsigned char reuse,
nonagle;
+ atomic_t refcnt;
/* And these are ours. */
+ int hashent;
__u32 rcv_nxt;
- struct tcp_func *af_specific;
+ __u32 snd_nxt;
+ __u32 ts_recent;
+ long ts_recent_stamp;
struct tcp_bind_bucket *tb;
struct tcp_tw_bucket *next_death;
struct tcp_tw_bucket **pprev_death;
int death_slot;
+#ifdef CONFIG_TCP_TW_RECYCLE
+ unsigned long ttd;
+ int rto;
+#endif
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct in6_addr v6_daddr;
struct in6_addr v6_rcv_saddr;
@@ -152,6 +159,23 @@
extern kmem_cache_t *tcp_timewait_cachep;
+extern __inline__ void tcp_tw_put(struct tcp_tw_bucket *tw)
+{
+ if (atomic_dec_and_test(&tw->refcnt)) {
+#ifdef INET_REFCNT_DEBUG
+ printk(KERN_DEBUG "tw_bucket %p released\n", tw);
+#endif
+ kmem_cache_free(tcp_timewait_cachep, tw);
+ }
+}
+
+extern int tcp_tw_death_row_slot;
+extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
+extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
+extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
+extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
+
+
/* Socket demux engine toys. */
#ifdef __BIG_ENDIAN
#define TCP_COMBINED_PORTS(__sport, __dport) \
@@ -221,10 +245,14 @@
* poor stacks do signed 16bit maths!
*/
#define MAX_WINDOW 32767
-#define MIN_WINDOW 2048
-#define MAX_ACK_BACKLOG 2
#define MAX_DELAY_ACK 2
-#define TCP_WINDOW_DIFF 2048
+
+/*
+ * How much of the receive buffer do we advertise?
+ * (The rest is reserved for headers and driver packet overhead.)
+ * Use a power of 2.
+ */
+#define WINDOW_ADVERTISE_DIVISOR 2
/* urg_data states */
#define URG_VALID 0x0100
@@ -248,8 +276,6 @@
#define TCP_FIN_TIMEOUT (3*60*HZ) /* BSD style FIN_WAIT2 deadlock breaker */
#define TCP_ACK_TIME (3*HZ) /* time to delay before sending an ACK */
-#define TCP_DONE_TIME (5*HZ/2)/* maximum time to wait before actually
- * destroying a socket */
#define TCP_WRITE_TIME (30*HZ) /* initial time to wait for an ACK,
* after last transmit */
#define TCP_TIMEOUT_INIT (3*HZ) /* RFC 1122 initial timeout value */
@@ -267,8 +293,6 @@
* we tell the link layer that it is something
* wrong (e.g. that it can expire redirects) */
-#define TCP_BUCKETGC_PERIOD (HZ)
-
/* TIME_WAIT reaping mechanism. */
#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
#define TCP_TWKILL_PERIOD ((HZ*60)/TCP_TWKILL_SLOTS)
@@ -302,10 +326,18 @@
#define TCPOLEN_SACK_BASE_ALIGNED 4
#define TCPOLEN_SACK_PERBLOCK 8
+#define TIME_WRITE 1 /* Not yet used */
+#define TIME_RETRANS 2 /* Retransmit timer */
+#define TIME_DACK 3 /* Delayed ack timer */
+#define TIME_PROBE0 4
+#define TIME_KEEPOPEN 5
+
struct open_request;
struct or_calltable {
+ int family;
void (*rtx_syn_ack) (struct sock *sk, struct open_request *req);
+ void (*send_ack) (struct sk_buff *skb, struct open_request *req);
void (*destructor) (struct open_request *req);
void (*send_reset) (struct sk_buff *skb);
};
@@ -352,9 +384,6 @@
struct tcp_v6_open_req v6_req;
#endif
} af;
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- __u16 lcl_port; /* LVE */
-#endif
};
/* SLAB cache for open requests. */
@@ -363,6 +392,12 @@
#define tcp_openreq_alloc() kmem_cache_alloc(tcp_openreq_cachep, SLAB_ATOMIC)
#define tcp_openreq_free(req) kmem_cache_free(tcp_openreq_cachep, req)
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#define TCP_INET_FAMILY(fam) ((fam) == AF_INET)
+#else
+#define TCP_INET_FAMILY(fam) 1
+#endif
+
/*
* Pointers to address related TCP functions
* (i.e. things that depend on the address family)
@@ -376,7 +411,7 @@
*/
struct tcp_func {
- void (*queue_xmit) (struct sk_buff *skb);
+ int (*queue_xmit) (struct sk_buff *skb);
void (*send_check) (struct sock *sk,
struct tcphdr *th,
@@ -386,16 +421,14 @@
int (*rebuild_header) (struct sock *sk);
int (*conn_request) (struct sock *sk,
- struct sk_buff *skb,
- __u32 isn);
+ struct sk_buff *skb);
struct sock * (*syn_recv_sock) (struct sock *sk,
struct sk_buff *skb,
struct open_request *req,
struct dst_entry *dst);
- struct sock * (*get_sock) (struct sk_buff *skb,
- struct tcphdr *th);
+ int (*hash_connecting) (struct sock *sk);
__u16 net_header_len;
@@ -474,14 +507,26 @@
struct tcphdr *th,
unsigned len);
-extern int tcp_timewait_state_process(struct tcp_tw_bucket *tw,
+enum tcp_tw_status
+{
+ TCP_TW_SUCCESS = 0,
+ TCP_TW_RST = 1,
+ TCP_TW_ACK = 2,
+ TCP_TW_SYN = 3
+};
+
+extern enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw,
struct sk_buff *skb,
struct tcphdr *th,
unsigned len);
+extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb,
+ struct open_request *req,
+ struct open_request *prev);
+
extern void tcp_close(struct sock *sk,
long timeout);
-extern struct sock * tcp_accept(struct sock *sk, int flags);
+extern struct sock * tcp_accept(struct sock *sk, int flags, int *err);
extern unsigned int tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait);
extern void tcp_write_space(struct sock *sk);
@@ -514,8 +559,7 @@
struct sk_buff *skb);
extern int tcp_v4_conn_request(struct sock *sk,
- struct sk_buff *skb,
- __u32 isn);
+ struct sk_buff *skb);
extern struct sock * tcp_create_openreq_child(struct sock *sk,
struct open_request *req,
@@ -533,14 +577,18 @@
struct sockaddr *uaddr,
int addr_len);
-extern void tcp_connect(struct sock *sk,
- struct sk_buff *skb,
- int est_mss);
+extern int tcp_connect(struct sock *sk,
+ struct sk_buff *skb);
extern struct sk_buff * tcp_make_synack(struct sock *sk,
struct dst_entry *dst,
- struct open_request *req,
- int mss);
+ struct open_request *req);
+
+extern int tcp_disconnect(struct sock *sk, int flags);
+
+extern void tcp_unhash(struct sock *sk);
+
+extern int tcp_v4_hash_connecting(struct sock *sk);
/* From syncookies.c */
@@ -568,13 +616,9 @@
extern void tcp_transmit_skb(struct sock *, struct sk_buff *);
extern void tcp_send_skb(struct sock *, struct sk_buff *, int force_queue);
extern void tcp_send_ack(struct sock *sk);
-extern void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout);
-
-/* CONFIG_IP_TRANSPARENT_PROXY */
-extern int tcp_chkaddr(struct sk_buff *);
+extern void tcp_send_delayed_ack(struct sock *sk, int max_timeout);
/* tcp_timer.c */
-#define tcp_reset_msl_timer(x,y,z) net_reset_timer(x,y,z)
extern void tcp_reset_xmit_timer(struct sock *, int, unsigned long);
extern void tcp_init_xmit_timers(struct sock *);
extern void tcp_clear_xmit_timers(struct sock *);
@@ -583,8 +627,9 @@
extern void tcp_delack_timer(unsigned long);
extern void tcp_probe_timer(unsigned long);
-extern struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
- struct open_request *req);
+extern void tcp_delete_keepalive_timer (struct sock *);
+extern void tcp_reset_keepalive_timer (struct sock *, unsigned long);
+extern void tcp_keepalive_timer (unsigned long);
/*
* TCP slow timer
@@ -599,9 +644,8 @@
};
#define TCP_SLT_SYNACK 0
-#define TCP_SLT_KEEPALIVE 1
-#define TCP_SLT_TWKILL 2
-#define TCP_SLT_MAX 3
+#define TCP_SLT_TWKILL 1
+#define TCP_SLT_MAX 2
extern struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX];
@@ -626,6 +670,28 @@
return mss_now > 8 ? mss_now : 8;
}
+/* Initialize the RCV_MSS value.
+ * RCV_MSS is our guess about the MSS used by the peer;
+ * we have no direct information about it.
+ * It is better to underestimate RCV_MSS than to overestimate it:
+ * overestimations make us ACK less frequently than needed, while
+ * underestimations are easier to detect and fix by tcp_measure_rcv_mss().
+ */
+
+extern __inline__ void tcp_initialize_rcv_mss(struct sock *sk)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ struct dst_entry *dst = __sk_dst_get(sk);
+ int mss;
+
+ if (dst)
+ mss = dst->advmss;
+ else
+ mss = tp->mss_cache;
+
+ tp->rcv_mss = max(min(mss, 536), 8);
+}
+
/* Compute the actual receive window we are currently advertising.
* Rcv_nxt can be after the window if our peer pushes more data
* than the offered window.
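
For illustration only (not part of the patch): a user-space sketch of the
clamp in tcp_initialize_rcv_mss() above, with illustrative advmss values:

	#include <stdio.h>

	#define min(a, b) ((a) < (b) ? (a) : (b))
	#define max(a, b) ((a) > (b) ? (a) : (b))

	int main(void)
	{
		int advmss[] = { 1460, 9000, 4 };
		int i;

		/* clamps to [8, 536]: 1460 -> 536, 9000 -> 536, 4 -> 8 */
		for (i = 0; i < 3; i++)
			printf("rcv_mss = %d\n",
			       max(min(advmss[i], 536), 8));
		return 0;
	}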
@@ -686,21 +752,6 @@
return (new_win && (new_win > (cur_win << 1)));
}
-/* Recalculate snd_ssthresh, we want to set it to:
- *
- * one half the current congestion window, but no
- * less than two segments
- *
- * We must take into account the current send window
- * as well, however we keep track of that using different
- * units so a conversion is necessary. -DaveM
- */
-extern __inline__ __u32 tcp_recalc_ssthresh(struct tcp_opt *tp)
-{
- __u32 snd_wnd_packets = tp->snd_wnd / max(tp->mss_cache, 1);
-
- return max(min(snd_wnd_packets, tp->snd_cwnd) >> 1, 2);
-}
/* TCP timestamps are only 32-bits, this causes a slight
* complication on 64-bit systems since we store a snapshot
@@ -768,6 +819,32 @@
return tp->packets_out - tp->fackets_out + tp->retrans_out;
}
+/* Recalculate snd_ssthresh, we want to set it to:
+ *
+ * one half the current congestion window, but no
+ * less than two segments
+ *
+ * We must take into account the current send window
+ * as well, however we keep track of that using different
+ * units so a conversion is necessary. -DaveM
+ *
+ * RED-PEN.
+ * RFC 2581: "an easy mistake to make is to simply use cwnd,
+ * rather than FlightSize"
+ * I see no reference to FlightSize here. snd_wnd is not FlightSize;
+ * it is an a priori characteristic, not a measure of outstanding data.
+ *
+ * FlightSize = min((snd_nxt-snd_una)/mss, packets_out) ?
+ */
+extern __inline__ __u32 tcp_recalc_ssthresh(struct tcp_opt *tp)
+{
+ u32 FlightSize = (tp->snd_nxt - tp->snd_una)/tp->mss_cache;
+
+ FlightSize = min(FlightSize, tcp_packets_in_flight(tp));
+
+ return max(min(FlightSize, tp->snd_cwnd) >> 1, 2);
+}
+
/* This checks if the data bearing packet SKB (usually tp->send_head)
* should be put on the wire right now.
*/
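
For illustration only (not part of the patch): a user-space sketch of the
new ssthresh rule with illustrative values, showing the RFC 2581
FlightSize estimate in action:

	#include <stdio.h>

	#define min(a, b) ((a) < (b) ? (a) : (b))
	#define max(a, b) ((a) > (b) ? (a) : (b))

	int main(void)
	{
		unsigned int snd_nxt = 200000, snd_una = 120000;
		unsigned int mss = 1460, packets_in_flight = 40;
		unsigned int cwnd = 64, flight, ssthresh;

		flight = (snd_nxt - snd_una) / mss;		/* 54 */
		flight = min(flight, packets_in_flight);	/* 40 */

		/* half of min(FlightSize, cwnd), floored at 2 segments */
		ssthresh = max(min(flight, cwnd) >> 1, 2);
		printf("ssthresh = %u segments\n", ssthresh);	/* 20 */
		return 0;
	}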
@@ -797,6 +874,15 @@
!(TCP_SKB_CB(skb)->flags & (TCPCB_FLAG_URG|TCPCB_FLAG_FIN))))
nagle_check = 0;
+ /*
+ * Reset CWND after an idle period longer than rto. It would be better
+ * to save the last send time, but VJ in SIGCOMM'88 proposes using the
+ * keepalive timestamp. That is certainly not ideal (SMTP is still
+ * broken by it), but it is better than nothing for now.
+ */
+ if (tp->packets_out==0 && (s32)(tcp_time_stamp - tp->rcv_tstamp) > tp->rto)
+ tp->snd_cwnd = min(tp->snd_cwnd, 2);
+
/* Don't be strict about the congestion window for the
* final FIN frame. -DaveM
*/
@@ -845,6 +931,17 @@
TCPF_FIN_WAIT2|TCPF_SYN_RECV));
}
+extern __inline const int tcp_established(const int state)
+{
+ return ((1 << state) &
+ (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
+ TCPF_FIN_WAIT2));
+}
+
+
+extern void tcp_destroy_sock(struct sock *sk);
+
+
/*
* Calculate(/check) TCP checksum
*/
@@ -869,12 +966,6 @@
{
int oldstate = sk->state;
- sk->state = state;
-
-#ifdef STATE_TRACE
- SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]);
-#endif
-
switch (state) {
case TCP_ESTABLISHED:
if (oldstate != TCP_ESTABLISHED)
@@ -882,17 +973,31 @@
break;
case TCP_CLOSE:
- {
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- /* Should be about 2 rtt's */
- net_reset_timer(sk, TIME_DONE, min(tp->srtt * 2, TCP_DONE_TIME));
sk->prot->unhash(sk);
/* fall through */
- }
default:
if (oldstate==TCP_ESTABLISHED)
tcp_statistics.TcpCurrEstab--;
}
+
+ /* Change state AFTER socket is unhashed to avoid closed
+ * socket sitting in hash tables.
+ */
+ sk->state = state;
+
+#ifdef STATE_TRACE
+ SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]);
+#endif
+}
+
+static __inline__ void tcp_done(struct sock *sk)
+{
+ sk->shutdown = SHUTDOWN_MASK;
+
+ if (!sk->dead)
+ sk->state_change(sk);
+ else
+ tcp_destroy_sock(sk);
}
static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *tp, __u32 tstamp)
@@ -931,7 +1036,7 @@
/* We always get an MSS option.
* The option bytes which will be seen in normal data
* packets should timestamps be used, must be in the MSS
- * advertised. But we subtract them from sk->mss so
+ * advertised. But we subtract them from tp->mss_cache so
* that calculations in tcp_sendmsg are simpler etc.
* So account for this fact here if necessary. If we
* don't do this correctly, as a receiver we won't
@@ -965,7 +1070,7 @@
* be a multiple of mss if possible. We assume here that mss >= 1.
* This MUST be enforced by all callers.
*/
-extern __inline__ void tcp_select_initial_window(__u32 space, __u16 mss,
+extern __inline__ void tcp_select_initial_window(int space, __u32 mss,
__u32 *rcv_wnd,
__u32 *window_clamp,
int wscale_ok,
@@ -999,6 +1104,18 @@
(*window_clamp) = min(65535<<(*rcv_wscale),*window_clamp);
}
+/* Note: caller must be prepared to deal with negative returns */
+extern __inline__ int tcp_space(struct sock *sk)
+{
+ return (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) /
+ WINDOW_ADVERTISE_DIVISOR;
+}
+
+extern __inline__ int tcp_full_space(struct sock *sk)
+{
+ return sk->rcvbuf / WINDOW_ADVERTISE_DIVISOR;
+}
+
extern __inline__ void tcp_synq_unlink(struct tcp_opt *tp, struct open_request *req, struct open_request *prev)
{
if(!req->dl_next)
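
For illustration only (not part of the patch): the effect of
WINDOW_ADVERTISE_DIVISOR on tcp_space()/tcp_full_space() is easiest to see
with numbers. A user-space sketch with illustrative buffer sizes:

	#include <stdio.h>

	#define WINDOW_ADVERTISE_DIVISOR 2

	int main(void)
	{
		int rcvbuf = 65536;	/* sk->rcvbuf */
		int rmem_alloc = 16384;	/* bytes already queued */

		/* tcp_space(): advertise only half of what is free */
		printf("space = %d\n",
		       (rcvbuf - rmem_alloc) / WINDOW_ADVERTISE_DIVISOR);
		/* tcp_full_space(): half of the whole buffer */
		printf("full space = %d\n",
		       rcvbuf / WINDOW_ADVERTISE_DIVISOR);
		return 0;	/* prints 24576 and 32768 */
	}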
@@ -1060,29 +1177,58 @@
printk(timer_bug_msg);
return;
};
- if(timer->prev != NULL)
- del_timer(timer);
+
+ spin_lock_bh(&sk->timer_lock);
+ if (timer->prev != NULL && del_timer(timer))
+ __sock_put(sk);
+ spin_unlock_bh(&sk->timer_lock);
}
+/* This function does not return a reliable answer; treat it only as advice.
+ */
+
static inline int tcp_timer_is_set(struct sock *sk, int what)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ int ret;
switch (what) {
case TIME_RETRANS:
- return tp->retransmit_timer.prev != NULL;
+ ret = tp->retransmit_timer.prev != NULL;
break;
case TIME_DACK:
- return tp->delack_timer.prev != NULL;
+ ret = tp->delack_timer.prev != NULL;
break;
case TIME_PROBE0:
- return tp->probe_timer.prev != NULL;
+ ret = tp->probe_timer.prev != NULL;
break;
default:
+ ret = 0;
printk(timer_bug_msg);
};
- return 0;
+ return ret;
}
+
+extern void tcp_listen_wlock(void);
+
+/* - We may sleep inside this lock.
+ * - If sleeping is not required (or called from BH),
+ * use plain read_(un)lock(&tcp_lhash_lock).
+ */
+
+extern __inline__ void tcp_listen_lock(void)
+{
+ /* read_lock synchronizes with candidate writers */
+ read_lock(&tcp_lhash_lock);
+ atomic_inc(&tcp_lhash_users);
+ read_unlock(&tcp_lhash_lock);
+}
+
+extern __inline__ void tcp_listen_unlock(void)
+{
+ if (atomic_dec_and_test(&tcp_lhash_users))
+ wake_up(&tcp_lhash_wait);
+}
#endif /* _TCP_H */
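
For illustration only (not part of the patch): a hedged sketch of how a
sleeping reader would use the listening-hash lock pair above; the
iteration body is a placeholder:

	static void example_walk_listeners(void)
	{
		struct sock *sk;
		int i;

		tcp_listen_lock();
		for (i = 0; i < TCP_LHTABLE_SIZE; i++) {
			for (sk = tcp_listening_hash[i]; sk; sk = sk->next) {
				/* inspect sk; sleeping is allowed here,
				 * unlike under a plain read_lock() */
			}
		}
		tcp_listen_unlock();
	}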