Index: head/sys/kern/subr_witness.c
===================================================================
--- head/sys/kern/subr_witness.c	(revision 335355)
+++ head/sys/kern/subr_witness.c	(revision 335356)
@@ -1,3084 +1,3084 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2008 Isilon Systems, Inc.
  * Copyright (c) 2008 Ilya Maykov <ivmaykov@gmail.com>
  * Copyright (c) 1998 Berkeley Software Design, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  *    promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
  *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
  */
 
 /*
  * Implementation of the `witness' lock verifier.  Originally implemented for
  * mutexes in BSD/OS.  Extended to handle generic lock objects and lock
  * classes in FreeBSD.
  */
 
 /*
  *	Main Entry: witness
  *	Pronunciation: 'wit-n&s
  *	Function: noun
  *	Etymology: Middle English witnesse, from Old English witnes knowledge,
  *	    testimony, witness, from 2wit
  *	Date: before 12th century
  *	1 : attestation of a fact or event : TESTIMONY
  *	2 : one that gives evidence; specifically : one who testifies in
  *	    a cause or before a judicial tribunal
  *	3 : one asked to be present at a transaction so as to be able to
  *	    testify to its having taken place
  *	4 : one who has personal knowledge of something
  *	5 a : something serving as evidence or proof : SIGN
  *	  b : public affirmation by word or example of usually
  *	      religious faith or conviction <the heroic witness to divine
  *	      life -- Pilot>
  *	6 capitalized : a member of the Jehovah's Witnesses 
  */
 
 /*
  * Special rules concerning Giant and lock orders:
  *
  * 1) Giant must be acquired before any other mutexes.  Stated another way,
  *    no other mutex may be held when Giant is acquired.
  *
  * 2) Giant must be released when blocking on a sleepable lock.
  *
  * This rule is less obvious, but is a result of Giant providing the same
  * semantics as spl().  Basically, when a thread sleeps, it must release
  * Giant.  When a thread blocks on a sleepable lock, it sleeps.  Hence rule
  * 2).
  *
  * 3) Giant may be acquired before or after sleepable locks.
  *
  * This rule is also not quite as obvious.  Giant may be acquired after
  * a sleepable lock because it is a non-sleepable lock and non-sleepable
  * locks may always be acquired while holding a sleepable lock.  The second
  * case, Giant before a sleepable lock, follows from rule 2) above.  Suppose
  * you have two threads T1 and T2 and a sleepable lock X.  Suppose that T1
  * acquires X and blocks on Giant.  Then suppose that T2 acquires Giant and
  * blocks on X.  When T2 blocks on X, T2 will release Giant allowing T1 to
  * execute.  Thus, acquiring Giant both before and after a sleepable lock
  * will not result in a lock order reversal.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_stack.h"
 #include "opt_witness.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/stack.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <machine/stdarg.h>
 
 #if !defined(DDB) && !defined(STACK)
 #error "DDB or STACK options are required for WITNESS"
 #endif
 
 /* Note that these traces do not work with KTR_ALQ. */
 #if 0
 #define	KTR_WITNESS	KTR_SUBSYS
 #else
 #define	KTR_WITNESS	0
 #endif
 
 #define	LI_RECURSEMASK	0x0000ffff	/* Recursion depth of lock instance. */
 #define	LI_EXCLUSIVE	0x00010000	/* Exclusive lock instance. */
 #define	LI_NORELEASE	0x00020000	/* Lock not allowed to be released. */
 
 /* Define this to check for blessed mutexes */
 #undef BLESSING
 
 #ifndef WITNESS_COUNT
 #define	WITNESS_COUNT 		1536
 #endif
 #define	WITNESS_HASH_SIZE	251	/* Prime, gives load factor < 2 */
 #define	WITNESS_PENDLIST	(512 + (MAXCPU * 4))
 
 /* Allocate 256 KB of stack data space */
 #define	WITNESS_LO_DATA_COUNT	2048
 
 /* Prime, gives load factor of ~2 at full load */
 #define	WITNESS_LO_HASH_SIZE	1021
 
 /*
  * XXX: This is somewhat bogus, as we assume here that at most 2048 threads
  * will hold LOCK_NCHILDREN locks.  We handle failure ok, and we should
  * probably be safe for the most part, but it's still a SWAG.
  */
 #define	LOCK_NCHILDREN	5
 #define	LOCK_CHILDCOUNT	2048
 
 #define	MAX_W_NAME	64
 
 #define	FULLGRAPH_SBUF_SIZE	512
 
 /*
  * These flags go in the witness relationship matrix and describe the
  * relationship between any two struct witness objects.
  */
 #define	WITNESS_UNRELATED        0x00    /* No lock order relation. */
 #define	WITNESS_PARENT           0x01    /* Parent, aka direct ancestor. */
 #define	WITNESS_ANCESTOR         0x02    /* Direct or indirect ancestor. */
 #define	WITNESS_CHILD            0x04    /* Child, aka direct descendant. */
 #define	WITNESS_DESCENDANT       0x08    /* Direct or indirect descendant. */
 #define	WITNESS_ANCESTOR_MASK    (WITNESS_PARENT | WITNESS_ANCESTOR)
 #define	WITNESS_DESCENDANT_MASK  (WITNESS_CHILD | WITNESS_DESCENDANT)
 #define	WITNESS_RELATED_MASK						\
 	(WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
 #define	WITNESS_REVERSAL         0x10    /* A lock order reversal has been
 					  * observed. */
 #define	WITNESS_RESERVED1        0x20    /* Unused flag, reserved. */
 #define	WITNESS_RESERVED2        0x40    /* Unused flag, reserved. */
 #define	WITNESS_LOCK_ORDER_KNOWN 0x80    /* This lock order is known. */
 
 /* Descendant to ancestor flags */
 #define	WITNESS_DTOA(x)	(((x) & WITNESS_RELATED_MASK) >> 2)
 
 /* Ancestor to descendant flags */
 #define	WITNESS_ATOD(x)	(((x) & WITNESS_RELATED_MASK) << 2)
 
 #define	WITNESS_INDEX_ASSERT(i)						\
 	MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count)
 
 static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness");
 
 /*
  * Lock instances.  A lock instance is the data associated with a lock while
  * it is held by witness.  For example, a lock instance will hold the
  * recursion count of a lock.  Lock instances are held in lists.  Spin locks
  * are held in a per-cpu list while sleep locks are held in per-thread list.
  */
 struct lock_instance {
 	struct lock_object	*li_lock;
 	const char		*li_file;
 	int			li_line;
 	u_int			li_flags;
 };
 
 /*
  * A simple list type used to build the list of locks held by a thread
  * or CPU.  We can't simply embed the list in struct lock_object since a
  * lock may be held by more than one thread if it is a shared lock.  Locks
  * are added to the head of the list, so we fill up each list entry from
  * "the back" logically.  To ease some of the arithmetic, we actually fill
  * in each list entry the normal way (children[0] then children[1], etc.) but
  * when we traverse the list we read children[count-1] as the first entry
  * down to children[0] as the final entry.
  */
 struct lock_list_entry {
 	struct lock_list_entry	*ll_next;
 	struct lock_instance	ll_children[LOCK_NCHILDREN];
 	u_int			ll_count;
 };
 
 /*
  * The main witness structure. One of these per named lock type in the system
  * (for example, "vnode interlock").
  */
 struct witness {
 	char  			w_name[MAX_W_NAME];
 	uint32_t 		w_index;  /* Index in the relationship matrix */
 	struct lock_class	*w_class;
 	STAILQ_ENTRY(witness) 	w_list;		/* List of all witnesses. */
 	STAILQ_ENTRY(witness) 	w_typelist;	/* Witnesses of a type. */
 	struct witness		*w_hash_next; /* Linked list in hash buckets. */
 	const char		*w_file; /* File where last acquired */
 	uint32_t 		w_line; /* Line where last acquired */
 	uint32_t 		w_refcount;
 	uint16_t 		w_num_ancestors; /* direct/indirect
 						  * ancestor count */
 	uint16_t 		w_num_descendants; /* direct/indirect
 						    * descendant count */
 	int16_t 		w_ddb_level;
 	unsigned		w_displayed:1;
 	unsigned		w_reversed:1;
 };
 
 STAILQ_HEAD(witness_list, witness);
 
 /*
  * The witness hash table. Keys are witness names (const char *), elements are
  * witness objects (struct witness *).
  */
 struct witness_hash {
 	struct witness	*wh_array[WITNESS_HASH_SIZE];
 	uint32_t	wh_size;
 	uint32_t	wh_count;
 };
 
 /*
  * Key type for the lock order data hash table.
  */
 struct witness_lock_order_key {
 	uint16_t	from;
 	uint16_t	to;
 };
 
 struct witness_lock_order_data {
 	struct stack			wlod_stack;
 	struct witness_lock_order_key	wlod_key;
 	struct witness_lock_order_data	*wlod_next;
 };
 
 /*
  * The witness lock order data hash table. Keys are witness index tuples
  * (struct witness_lock_order_key), elements are lock order data objects
  * (struct witness_lock_order_data). 
  */
 struct witness_lock_order_hash {
 	struct witness_lock_order_data	*wloh_array[WITNESS_LO_HASH_SIZE];
 	u_int	wloh_size;
 	u_int	wloh_count;
 };
 
 #ifdef BLESSING
 struct witness_blessed {
 	const char	*b_lock1;
 	const char	*b_lock2;
 };
 #endif
 
 struct witness_pendhelp {
 	const char		*wh_type;
 	struct lock_object	*wh_lock;
 };
 
 struct witness_order_list_entry {
 	const char		*w_name;
 	struct lock_class	*w_class;
 };
 
 /*
  * Returns 0 if one of the locks is a spin lock and the other is not.
  * Returns 1 otherwise.
  */
 static __inline int
 witness_lock_type_equal(struct witness *w1, struct witness *w2)
 {
 
 	return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
 		(w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
 }
 
 static __inline int
 witness_lock_order_key_equal(const struct witness_lock_order_key *a,
     const struct witness_lock_order_key *b)
 {
 
 	return (a->from == b->from && a->to == b->to);
 }
 
 static int	_isitmyx(struct witness *w1, struct witness *w2, int rmask,
 		    const char *fname);
 static void	adopt(struct witness *parent, struct witness *child);
 #ifdef BLESSING
 static int	blessed(struct witness *, struct witness *);
 #endif
 static void	depart(struct witness *w);
 static struct witness	*enroll(const char *description,
 			    struct lock_class *lock_class);
 static struct lock_instance	*find_instance(struct lock_list_entry *list,
 				    const struct lock_object *lock);
 static int	isitmychild(struct witness *parent, struct witness *child);
 static int	isitmydescendant(struct witness *parent, struct witness *child);
 static void	itismychild(struct witness *parent, struct witness *child);
 static int	sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS);
 static void	witness_add_fullgraph(struct sbuf *sb, struct witness *parent);
 #ifdef DDB
 static void	witness_ddb_compute_levels(void);
 static void	witness_ddb_display(int(*)(const char *fmt, ...));
 static void	witness_ddb_display_descendants(int(*)(const char *fmt, ...),
 		    struct witness *, int indent);
 static void	witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
 		    struct witness_list *list);
 static void	witness_ddb_level_descendants(struct witness *parent, int l);
 static void	witness_ddb_list(struct thread *td);
 #endif
 static void	witness_debugger(int cond, const char *msg);
 static void	witness_free(struct witness *m);
 static struct witness	*witness_get(void);
 static uint32_t	witness_hash_djb2(const uint8_t *key, uint32_t size);
 static struct witness	*witness_hash_get(const char *key);
 static void	witness_hash_put(struct witness *w);
 static void	witness_init_hash_tables(void);
 static void	witness_increment_graph_generation(void);
 static void	witness_lock_list_free(struct lock_list_entry *lle);
 static struct lock_list_entry	*witness_lock_list_get(void);
 static int	witness_lock_order_add(struct witness *parent,
 		    struct witness *child);
 static int	witness_lock_order_check(struct witness *parent,
 		    struct witness *child);
 static struct witness_lock_order_data	*witness_lock_order_get(
 					    struct witness *parent,
 					    struct witness *child);
 static void	witness_list_lock(struct lock_instance *instance,
 		    int (*prnt)(const char *fmt, ...));
 static int	witness_output(const char *fmt, ...) __printflike(1, 2);
 static int	witness_voutput(const char *fmt, va_list ap) __printflike(1, 0);
 static void	witness_setflag(struct lock_object *lock, int flag, int set);
 
 static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL,
     "Witness Locking");
 
 /*
  * If set to 0, lock order checking is disabled.  If set to -1,
  * witness is completely disabled.  Otherwise witness performs full
  * lock order checking for all locks.  At runtime, lock order checking
  * may be toggled.  However, witness cannot be reenabled once it is
  * completely disabled.
  */
 static int witness_watch = 1;
 SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT, NULL, 0,
     sysctl_debug_witness_watch, "I", "witness is watching lock operations");
 
 #ifdef KDB
 /*
  * When KDB is enabled and witness_kdb is 1, it will cause the system
  * to drop into kdebug() when:
  *	- a lock hierarchy violation occurs
  *	- locks are held when going to sleep.
  */
 #ifdef WITNESS_KDB
 int	witness_kdb = 1;
 #else
 int	witness_kdb = 0;
 #endif
 SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, "");
 #endif /* KDB */
 
 #if defined(DDB) || defined(KDB)
 /*
  * When DDB or KDB is enabled and witness_trace is 1, it will cause the system
  * to print a stack trace:
  *	- a lock hierarchy violation occurs
  *	- locks are held when going to sleep.
  */
 int	witness_trace = 1;
 SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, "");
 #endif /* DDB || KDB */
 
 #ifdef WITNESS_SKIPSPIN
 int	witness_skipspin = 1;
 #else
 int	witness_skipspin = 0;
 #endif
 SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, "");
 
 int badstack_sbuf_size;
 
 int witness_count = WITNESS_COUNT;
 SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, 
     &witness_count, 0, "");
 
 /*
  * Output channel for witness messages.  By default we print to the console.
  */
 enum witness_channel {
 	WITNESS_CONSOLE,
 	WITNESS_LOG,
 	WITNESS_NONE,
 };
 
 static enum witness_channel witness_channel = WITNESS_CONSOLE;
 SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING |
     CTLFLAG_RWTUN, NULL, 0, sysctl_debug_witness_channel, "A",
     "Output channel for warnings");
 
 /*
  * Call this to print out the relations between locks.
  */
 SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs");
 
 /*
  * Call this to print out the witness faulty stacks.
  */
 SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks");
 
 static struct mtx w_mtx;
 
 /* w_list */
 static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free);
 static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all);
 
 /* w_typelist */
 static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin);
 static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep);
 
 /* lock list */
 static struct lock_list_entry *w_lock_list_free = NULL;
 static struct witness_pendhelp pending_locks[WITNESS_PENDLIST];
 static u_int pending_cnt;
 
 static int w_free_cnt, w_spin_cnt, w_sleep_cnt;
 SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, "");
 SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, "");
 SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0,
     "");
 
 static struct witness *w_data;
 static uint8_t **w_rmatrix;
 static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
 static struct witness_hash w_hash;	/* The witness hash table. */
 
 /* The lock order data hash */
 static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT];
 static struct witness_lock_order_data *w_lofree = NULL;
 static struct witness_lock_order_hash w_lohash;
 static int w_max_used_index = 0;
 static unsigned int w_generation = 0;
 static const char w_notrunning[] = "Witness not running\n";
 static const char w_stillcold[] = "Witness is still cold\n";
 #ifdef __i386__
 static const char w_notallowed[] = "The sysctl is disabled on the arch\n";
 #endif
 
 static struct witness_order_list_entry order_lists[] = {
 	/*
 	 * sx locks
 	 */
 	{ "proctree", &lock_class_sx },
 	{ "allproc", &lock_class_sx },
 	{ "allprison", &lock_class_sx },
 	{ NULL, NULL },
 	/*
 	 * Various mutexes
 	 */
 	{ "Giant", &lock_class_mtx_sleep },
 	{ "pipe mutex", &lock_class_mtx_sleep },
 	{ "sigio lock", &lock_class_mtx_sleep },
 	{ "process group", &lock_class_mtx_sleep },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-sleep", &lock_class_mtx_sleep },
 #endif
 	{ "process lock", &lock_class_mtx_sleep },
 	{ "session", &lock_class_mtx_sleep },
 	{ "uidinfo hash", &lock_class_rw },
 	{ "time lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * umtx
 	 */
 	{ "umtx lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * Sockets
 	 */
 	{ "accept", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ "so_rcv", &lock_class_mtx_sleep },
 	{ "sellck", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * Routing
 	 */
 	{ "so_rcv", &lock_class_mtx_sleep },
 	{ "radix node head", &lock_class_rm },
 	{ "rtentry", &lock_class_mtx_sleep },
 	{ "ifaddr", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * IPv4 multicast:
 	 * protocol locks before interface locks, after UDP locks.
 	 */
 	{ "in_multi_sx", &lock_class_sx },
 	{ "udpinp", &lock_class_rw },
 	{ "in_multi_list_mtx", &lock_class_mtx_sleep },
 	{ "igmp_mtx", &lock_class_mtx_sleep },
 	{ "ifnet_rw", &lock_class_rw },
 	{ "if_addr_lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * IPv6 multicast:
 	 * protocol locks before interface locks, after UDP locks.
 	 */
 	{ "in6_multi_sx", &lock_class_sx },
 	{ "udpinp", &lock_class_rw },
 	{ "in6_multi_list_mtx", &lock_class_mtx_sleep },
 	{ "mld_mtx", &lock_class_mtx_sleep },
 	{ "ifnet_rw", &lock_class_rw },
 	{ "if_addr_lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * UNIX Domain Sockets
 	 */
 	{ "unp_link_rwlock", &lock_class_rw },
 	{ "unp_list_lock", &lock_class_mtx_sleep },
 	{ "unp", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * UDP/IP
 	 */
-	{ "udp", &lock_class_rw },
+	{ "udp", &lock_class_mtx_sleep },
 	{ "udpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * TCP/IP
 	 */
-	{ "tcp", &lock_class_rw },
+	{ "tcp", &lock_class_mtx_sleep },
 	{ "tcpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * BPF
 	 */
 	{ "bpf global lock", &lock_class_sx },
 	{ "bpf interface lock", &lock_class_rw },
 	{ "bpf cdev lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * NFS server
 	 */
 	{ "nfsd_mtx", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 
 	/*
 	 * IEEE 802.11
 	 */
 	{ "802.11 com lock", &lock_class_mtx_sleep},
 	{ NULL, NULL },
 	/*
 	 * Network drivers
 	 */
 	{ "network driver", &lock_class_mtx_sleep},
 	{ NULL, NULL },
 
 	/*
 	 * Netgraph
 	 */
 	{ "ng_node", &lock_class_mtx_sleep },
 	{ "ng_worklist", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * CDEV
 	 */
 	{ "vm map (system)", &lock_class_mtx_sleep },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ "cdev", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * VM
 	 */
 	{ "vm map (user)", &lock_class_sx },
 	{ "vm object", &lock_class_rw },
 	{ "vm page", &lock_class_mtx_sleep },
 	{ "pmap pv global", &lock_class_rw },
 	{ "pmap", &lock_class_mtx_sleep },
 	{ "pmap pv list", &lock_class_rw },
 	{ "vm page free queue", &lock_class_mtx_sleep },
 	{ "vm pagequeue", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * kqueue/VFS interaction
 	 */
 	{ "kqueue", &lock_class_mtx_sleep },
 	{ "struct mount mtx", &lock_class_mtx_sleep },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * VFS namecache
 	 */
 	{ "ncvn", &lock_class_mtx_sleep },
 	{ "ncbuc", &lock_class_rw },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ "ncneg", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * ZFS locking
 	 */
 	{ "dn->dn_mtx", &lock_class_sx },
 	{ "dr->dt.di.dr_mtx", &lock_class_sx },
 	{ "db->db_mtx", &lock_class_sx },
 	{ NULL, NULL },
 	/*
 	 * TCP log locks
 	 */
 	{ "TCP ID tree", &lock_class_rw },
 	{ "tcp log id bucket", &lock_class_mtx_sleep },
 	{ "tcpinp", &lock_class_rw },
 	{ "TCP log expireq", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * spin locks
 	 */
 #ifdef SMP
 	{ "ap boot", &lock_class_mtx_spin },
 #endif
 	{ "rm.mutex_mtx", &lock_class_mtx_spin },
 	{ "sio", &lock_class_mtx_spin },
 #ifdef __i386__
 	{ "cy", &lock_class_mtx_spin },
 #endif
 #ifdef __sparc64__
 	{ "pcib_mtx", &lock_class_mtx_spin },
 	{ "rtc_mtx", &lock_class_mtx_spin },
 #endif
 	{ "scc_hwmtx", &lock_class_mtx_spin },
 	{ "uart_hwmtx", &lock_class_mtx_spin },
 	{ "fast_taskqueue", &lock_class_mtx_spin },
 	{ "intr table", &lock_class_mtx_spin },
 	{ "process slock", &lock_class_mtx_spin },
 	{ "syscons video lock", &lock_class_mtx_spin },
 	{ "sleepq chain", &lock_class_mtx_spin },
 	{ "rm_spinlock", &lock_class_mtx_spin },
 	{ "turnstile chain", &lock_class_mtx_spin },
 	{ "turnstile lock", &lock_class_mtx_spin },
 	{ "sched lock", &lock_class_mtx_spin },
 	{ "td_contested", &lock_class_mtx_spin },
 	{ "callout", &lock_class_mtx_spin },
 	{ "entropy harvest mutex", &lock_class_mtx_spin },
 #ifdef SMP
 	{ "smp rendezvous", &lock_class_mtx_spin },
 #endif
 #ifdef __powerpc__
 	{ "tlb0", &lock_class_mtx_spin },
 #endif
 	{ NULL, NULL },
 	{ "sched lock", &lock_class_mtx_spin },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-per-proc", &lock_class_mtx_spin },
 #endif
 	{ NULL, NULL },
 	/*
 	 * leaf locks
 	 */
 	{ "intrcnt", &lock_class_mtx_spin },
 	{ "icu", &lock_class_mtx_spin },
 #if defined(SMP) && defined(__sparc64__)
 	{ "ipi", &lock_class_mtx_spin },
 #endif
 #ifdef __i386__
 	{ "allpmaps", &lock_class_mtx_spin },
 	{ "descriptor tables", &lock_class_mtx_spin },
 #endif
 	{ "clk", &lock_class_mtx_spin },
 	{ "cpuset", &lock_class_mtx_spin },
 	{ "mprof lock", &lock_class_mtx_spin },
 	{ "zombie lock", &lock_class_mtx_spin },
 	{ "ALD Queue", &lock_class_mtx_spin },
 #if defined(__i386__) || defined(__amd64__)
 	{ "pcicfg", &lock_class_mtx_spin },
 	{ "NDIS thread lock", &lock_class_mtx_spin },
 #endif
 	{ "tw_osl_io_lock", &lock_class_mtx_spin },
 	{ "tw_osl_q_lock", &lock_class_mtx_spin },
 	{ "tw_cl_io_lock", &lock_class_mtx_spin },
 	{ "tw_cl_intr_lock", &lock_class_mtx_spin },
 	{ "tw_cl_gen_lock", &lock_class_mtx_spin },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-leaf", &lock_class_mtx_spin },
 #endif
 	{ "blocked lock", &lock_class_mtx_spin },
 	{ NULL, NULL },
 	{ NULL, NULL }
 };
 
 #ifdef BLESSING
 /*
  * Pairs of locks which have been blessed
  * Don't complain about order problems with blessed locks
  */
 static struct witness_blessed blessed_list[] = {
 };
 #endif
 
 /*
  * This global is set to 0 once it becomes safe to use the witness code.
  */
 static int witness_cold = 1;
 
 /*
  * This global is set to 1 once the static lock orders have been enrolled
  * so that a warning can be issued for any spin locks enrolled later.
  */
 static int witness_spin_warn = 0;
 
 /* Trim useless garbage from filenames. */
 static const char *
 fixup_filename(const char *file)
 {
 
 	if (file == NULL)
 		return (NULL);
 	while (strncmp(file, "../", 3) == 0)
 		file += 3;
 	return (file);
 }
 
 /*
  * Calculate the size of early witness structures.
  */
 int
 witness_startup_count(void)
 {
 	int sz;
 
 	sz = sizeof(struct witness) * witness_count;
 	sz += sizeof(*w_rmatrix) * (witness_count + 1);
 	sz += sizeof(*w_rmatrix[0]) * (witness_count + 1) *
 	    (witness_count + 1);
 
 	return (sz);
 }
 
 /*
  * The WITNESS-enabled diagnostic code.  Note that the witness code does
  * assume that the early boot is single-threaded at least until after this
  * routine is completed.
  */
 void
 witness_startup(void *mem)
 {
 	struct lock_object *lock;
 	struct witness_order_list_entry *order;
 	struct witness *w, *w1;
 	uintptr_t p;
 	int i;
 
 	p = (uintptr_t)mem;
 	w_data = (void *)p;
 	p += sizeof(struct witness) * witness_count;
 
 	w_rmatrix = (void *)p;
 	p += sizeof(*w_rmatrix) * (witness_count + 1);
 
 	for (i = 0; i < witness_count + 1; i++) {
 		w_rmatrix[i] = (void *)p;
 		p += sizeof(*w_rmatrix[i]) * (witness_count + 1);
 	}
 	badstack_sbuf_size = witness_count * 256;
 
 	/*
 	 * We have to release Giant before initializing its witness
 	 * structure so that WITNESS doesn't get confused.
 	 */
 	mtx_unlock(&Giant);
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	CTR1(KTR_WITNESS, "%s: initializing witness", __func__);
 	mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET |
 	    MTX_NOWITNESS | MTX_NOPROFILE);
 	for (i = witness_count - 1; i >= 0; i--) {
 		w = &w_data[i];
 		memset(w, 0, sizeof(*w));
 		w_data[i].w_index = i;	/* Witness index never changes. */
 		witness_free(w);
 	}
 	KASSERT(STAILQ_FIRST(&w_free)->w_index == 0,
 	    ("%s: Invalid list of free witness objects", __func__));
 
 	/* Witness with index 0 is not used to aid in debugging. */
 	STAILQ_REMOVE_HEAD(&w_free, w_list);
 	w_free_cnt--;
 
 	for (i = 0; i < witness_count; i++) {
 		memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * 
 		    (witness_count + 1));
 	}
 
 	for (i = 0; i < LOCK_CHILDCOUNT; i++)
 		witness_lock_list_free(&w_locklistdata[i]);
 	witness_init_hash_tables();
 
 	/* First add in all the specified order lists. */
 	for (order = order_lists; order->w_name != NULL; order++) {
 		w = enroll(order->w_name, order->w_class);
 		if (w == NULL)
 			continue;
 		w->w_file = "order list";
 		for (order++; order->w_name != NULL; order++) {
 			w1 = enroll(order->w_name, order->w_class);
 			if (w1 == NULL)
 				continue;
 			w1->w_file = "order list";
 			itismychild(w, w1);
 			w = w1;
 		}
 	}
 	witness_spin_warn = 1;
 
 	/* Iterate through all locks and add them to witness. */
 	for (i = 0; pending_locks[i].wh_lock != NULL; i++) {
 		lock = pending_locks[i].wh_lock;
 		KASSERT(lock->lo_flags & LO_WITNESS,
 		    ("%s: lock %s is on pending list but not LO_WITNESS",
 		    __func__, lock->lo_name));
 		lock->lo_witness = enroll(pending_locks[i].wh_type,
 		    LOCK_CLASS(lock));
 	}
 
 	/* Mark the witness code as being ready for use. */
 	witness_cold = 0;
 
 	mtx_lock(&Giant);
 }
 
 void
 witness_init(struct lock_object *lock, const char *type)
 {
 	struct lock_class *class;
 
 	/* Various sanity checks. */
 	class = LOCK_CLASS(lock);
 	if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
 	    (class->lc_flags & LC_RECURSABLE) == 0)
 		kassert_panic("%s: lock (%s) %s can not be recursable",
 		    __func__, class->lc_name, lock->lo_name);
 	if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 	    (class->lc_flags & LC_SLEEPABLE) == 0)
 		kassert_panic("%s: lock (%s) %s can not be sleepable",
 		    __func__, class->lc_name, lock->lo_name);
 	if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
 	    (class->lc_flags & LC_UPGRADABLE) == 0)
 		kassert_panic("%s: lock (%s) %s can not be upgradable",
 		    __func__, class->lc_name, lock->lo_name);
 
 	/*
 	 * If we shouldn't watch this lock, then just clear lo_witness.
 	 * Otherwise, if witness_cold is set, then it is too early to
 	 * enroll this lock, so defer it to witness_initialize() by adding
 	 * it to the pending_locks list.  If it is not too early, then enroll
 	 * the lock now.
 	 */
 	if (witness_watch < 1 || panicstr != NULL ||
 	    (lock->lo_flags & LO_WITNESS) == 0)
 		lock->lo_witness = NULL;
 	else if (witness_cold) {
 		pending_locks[pending_cnt].wh_lock = lock;
 		pending_locks[pending_cnt++].wh_type = type;
 		if (pending_cnt > WITNESS_PENDLIST)
 			panic("%s: pending locks list is too small, "
 			    "increase WITNESS_PENDLIST\n",
 			    __func__);
 	} else
 		lock->lo_witness = enroll(type, class);
 }
 
 void
 witness_destroy(struct lock_object *lock)
 {
 	struct lock_class *class;
 	struct witness *w;
 
 	class = LOCK_CLASS(lock);
 
 	if (witness_cold)
 		panic("lock (%s) %s destroyed while witness_cold",
 		    class->lc_name, lock->lo_name);
 
 	/* XXX: need to verify that no one holds the lock */
 	if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL)
 		return;
 	w = lock->lo_witness;
 
 	mtx_lock_spin(&w_mtx);
 	MPASS(w->w_refcount > 0);
 	w->w_refcount--;
 
 	if (w->w_refcount == 0)
 		depart(w);
 	mtx_unlock_spin(&w_mtx);
 }
 
 #ifdef DDB
 static void
 witness_ddb_compute_levels(void)
 {
 	struct witness *w;
 
 	/*
 	 * First clear all levels.
 	 */
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_ddb_level = -1;
 
 	/*
 	 * Look for locks with no parents and level all their descendants.
 	 */
 	STAILQ_FOREACH(w, &w_all, w_list) {
 
 		/* If the witness has ancestors (is not a root), skip it. */
 		if (w->w_num_ancestors > 0)
 			continue;
 		witness_ddb_level_descendants(w, 0);
 	}
 }
 
 static void
 witness_ddb_level_descendants(struct witness *w, int l)
 {
 	int i;
 
 	if (w->w_ddb_level >= l)
 		return;
 
 	w->w_ddb_level = l;
 	l++;
 
 	for (i = 1; i <= w_max_used_index; i++) {
 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
 			witness_ddb_level_descendants(&w_data[i], l);
 	}
 }
 
 static void
 witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
     struct witness *w, int indent)
 {
 	int i;
 
  	for (i = 0; i < indent; i++)
  		prnt(" ");
 	prnt("%s (type: %s, depth: %d, active refs: %d)",
 	     w->w_name, w->w_class->lc_name,
 	     w->w_ddb_level, w->w_refcount);
  	if (w->w_displayed) {
  		prnt(" -- (already displayed)\n");
  		return;
  	}
  	w->w_displayed = 1;
 	if (w->w_file != NULL && w->w_line != 0)
 		prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file),
 		    w->w_line);
 	else
 		prnt(" -- never acquired\n");
 	indent++;
 	WITNESS_INDEX_ASSERT(w->w_index);
 	for (i = 1; i <= w_max_used_index; i++) {
 		if (db_pager_quit)
 			return;
 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
 			witness_ddb_display_descendants(prnt, &w_data[i],
 			    indent);
 	}
 }
 
 static void
 witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
     struct witness_list *list)
 {
 	struct witness *w;
 
 	STAILQ_FOREACH(w, list, w_typelist) {
 		if (w->w_file == NULL || w->w_ddb_level > 0)
 			continue;
 
 		/* This lock has no anscestors - display its descendants. */
 		witness_ddb_display_descendants(prnt, w, 0);
 		if (db_pager_quit)
 			return;
 	}
 }
 	
 static void
 witness_ddb_display(int(*prnt)(const char *fmt, ...))
 {
 	struct witness *w;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	witness_ddb_compute_levels();
 
 	/* Clear all the displayed flags. */
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_displayed = 0;
 
 	/*
 	 * First, handle sleep locks which have been acquired at least
 	 * once.
 	 */
 	prnt("Sleep locks:\n");
 	witness_ddb_display_list(prnt, &w_sleep);
 	if (db_pager_quit)
 		return;
 	
 	/*
 	 * Now do spin locks which have been acquired at least once.
 	 */
 	prnt("\nSpin locks:\n");
 	witness_ddb_display_list(prnt, &w_spin);
 	if (db_pager_quit)
 		return;
 	
 	/*
 	 * Finally, any locks which have not been acquired yet.
 	 */
 	prnt("\nLocks which were never acquired:\n");
 	STAILQ_FOREACH(w, &w_all, w_list) {
 		if (w->w_file != NULL || w->w_refcount == 0)
 			continue;
 		prnt("%s (type: %s, depth: %d)\n", w->w_name,
 		    w->w_class->lc_name, w->w_ddb_level);
 		if (db_pager_quit)
 			return;
 	}
 }
 #endif /* DDB */
 
 int
 witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
 {
 
 	if (witness_watch == -1 || panicstr != NULL)
 		return (0);
 
 	/* Require locks that witness knows about. */
 	if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL ||
 	    lock2->lo_witness == NULL)
 		return (EINVAL);
 
 	mtx_assert(&w_mtx, MA_NOTOWNED);
 	mtx_lock_spin(&w_mtx);
 
 	/*
 	 * If we already have either an explicit or implied lock order that
 	 * is the other way around, then return an error.
 	 */
 	if (witness_watch &&
 	    isitmydescendant(lock2->lo_witness, lock1->lo_witness)) {
 		mtx_unlock_spin(&w_mtx);
 		return (EDOOFUS);
 	}
 	
 	/* Try to add the new order. */
 	CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
 	    lock2->lo_witness->w_name, lock1->lo_witness->w_name);
 	itismychild(lock1->lo_witness, lock2->lo_witness);
 	mtx_unlock_spin(&w_mtx);
 	return (0);
 }
 
 void
 witness_checkorder(struct lock_object *lock, int flags, const char *file,
     int line, struct lock_object *interlock)
 {
 	struct lock_list_entry *lock_list, *lle;
 	struct lock_instance *lock1, *lock2, *plock;
 	struct lock_class *class, *iclass;
 	struct witness *w, *w1;
 	struct thread *td;
 	int i, j;
 
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL ||
 	    panicstr != NULL)
 		return;
 
 	w = lock->lo_witness;
 	class = LOCK_CLASS(lock);
 	td = curthread;
 
 	if (class->lc_flags & LC_SLEEPLOCK) {
 
 		/*
 		 * Since spin locks include a critical section, this check
 		 * implicitly enforces a lock order of all sleep locks before
 		 * all spin locks.
 		 */
 		if (td->td_critnest != 0 && !kdb_active)
 			kassert_panic("acquiring blockable sleep lock with "
 			    "spinlock or critical section held (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 
 		/*
 		 * If this is the first lock acquired then just return as
 		 * no order checking is needed.
 		 */
 		lock_list = td->td_sleeplocks;
 		if (lock_list == NULL || lock_list->ll_count == 0)
 			return;
 	} else {
 
 		/*
 		 * If this is the first lock, just return as no order
 		 * checking is needed.  Avoid problems with thread
 		 * migration pinning the thread while checking if
 		 * spinlocks are held.  If at least one spinlock is held
 		 * the thread is in a safe path and it is allowed to
 		 * unpin it.
 		 */
 		sched_pin();
 		lock_list = PCPU_GET(spinlocks);
 		if (lock_list == NULL || lock_list->ll_count == 0) {
 			sched_unpin();
 			return;
 		}
 		sched_unpin();
 	}
 
 	/*
 	 * Check to see if we are recursing on a lock we already own.  If
 	 * so, make sure that we don't mismatch exclusive and shared lock
 	 * acquires.
 	 */
 	lock1 = find_instance(lock_list, lock);
 	if (lock1 != NULL) {
 		if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
 		    (flags & LOP_EXCLUSIVE) == 0) {
 			witness_output("shared lock of (%s) %s @ %s:%d\n",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 			witness_output("while exclusively locked from %s:%d\n",
 			    fixup_filename(lock1->li_file), lock1->li_line);
 			kassert_panic("excl->share");
 		}
 		if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
 		    (flags & LOP_EXCLUSIVE) != 0) {
 			witness_output("exclusive lock of (%s) %s @ %s:%d\n",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 			witness_output("while share locked from %s:%d\n",
 			    fixup_filename(lock1->li_file), lock1->li_line);
 			kassert_panic("share->excl");
 		}
 		return;
 	}
 
 	/* Warn if the interlock is not locked exactly once. */
 	if (interlock != NULL) {
 		iclass = LOCK_CLASS(interlock);
 		lock1 = find_instance(lock_list, interlock);
 		if (lock1 == NULL)
 			kassert_panic("interlock (%s) %s not locked @ %s:%d",
 			    iclass->lc_name, interlock->lo_name,
 			    fixup_filename(file), line);
 		else if ((lock1->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic("interlock (%s) %s recursed @ %s:%d",
 			    iclass->lc_name, interlock->lo_name,
 			    fixup_filename(file), line);
 	}
 
 	/*
 	 * Find the previously acquired lock, but ignore interlocks.
 	 */
 	plock = &lock_list->ll_children[lock_list->ll_count - 1];
 	if (interlock != NULL && plock->li_lock == interlock) {
 		if (lock_list->ll_count > 1)
 			plock =
 			    &lock_list->ll_children[lock_list->ll_count - 2];
 		else {
 			lle = lock_list->ll_next;
 
 			/*
 			 * The interlock is the only lock we hold, so
 			 * simply return.
 			 */
 			if (lle == NULL)
 				return;
 			plock = &lle->ll_children[lle->ll_count - 1];
 		}
 	}
 	
 	/*
 	 * Try to perform most checks without a lock.  If this succeeds we
 	 * can skip acquiring the lock and return success.  Otherwise we redo
 	 * the check with the lock held to handle races with concurrent updates.
 	 */
 	w1 = plock->li_lock->lo_witness;
 	if (witness_lock_order_check(w1, w))
 		return;
 
 	mtx_lock_spin(&w_mtx);
 	if (witness_lock_order_check(w1, w)) {
 		mtx_unlock_spin(&w_mtx);
 		return;
 	}
 	witness_lock_order_add(w1, w);
 
 	/*
 	 * Check for duplicate locks of the same type.  Note that we only
 	 * have to check for this on the last lock we just acquired.  Any
 	 * other cases will be caught as lock order violations.
 	 */
 	if (w1 == w) {
 		i = w->w_index;
 		if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) &&
 		    !(w_rmatrix[i][i] & WITNESS_REVERSAL)) {
 		    w_rmatrix[i][i] |= WITNESS_REVERSAL;
 			w->w_reversed = 1;
 			mtx_unlock_spin(&w_mtx);
 			witness_output(
 			    "acquiring duplicate lock of same type: \"%s\"\n", 
 			    w->w_name);
 			witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name,
 			    fixup_filename(plock->li_file), plock->li_line);
 			witness_output(" 2nd %s @ %s:%d\n", lock->lo_name,
 			    fixup_filename(file), line);
 			witness_debugger(1, __func__);
 		} else
 			mtx_unlock_spin(&w_mtx);
 		return;
 	}
 	mtx_assert(&w_mtx, MA_OWNED);
 
 	/*
 	 * If we know that the lock we are acquiring comes after
 	 * the lock we most recently acquired in the lock order tree,
 	 * then there is no need for any further checks.
 	 */
 	if (isitmychild(w1, w))
 		goto out;
 
 	for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) {
 		for (i = lle->ll_count - 1; i >= 0; i--, j++) {
 
 			MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN);
 			lock1 = &lle->ll_children[i];
 
 			/*
 			 * Ignore the interlock.
 			 */
 			if (interlock == lock1->li_lock)
 				continue;
 
 			/*
 			 * If this lock doesn't undergo witness checking,
 			 * then skip it.
 			 */
 			w1 = lock1->li_lock->lo_witness;
 			if (w1 == NULL) {
 				KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0,
 				    ("lock missing witness structure"));
 				continue;
 			}
 
 			/*
 			 * If we are locking Giant and this is a sleepable
 			 * lock, then skip it.
 			 */
 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    lock == &Giant.lock_object)
 				continue;
 
 			/*
 			 * If we are locking a sleepable lock and this lock
 			 * is Giant, then skip it.
 			 */
 			if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    lock1->li_lock == &Giant.lock_object)
 				continue;
 
 			/*
 			 * If we are locking a sleepable lock and this lock
 			 * isn't sleepable, we want to treat it as a lock
 			 * order violation to enfore a general lock order of
 			 * sleepable locks before non-sleepable locks.
 			 */
 			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
 				goto reversal;
 
 			/*
 			 * If we are locking Giant and this is a non-sleepable
 			 * lock, then treat it as a reversal.
 			 */
 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
 			    lock == &Giant.lock_object)
 				goto reversal;
 
 			/*
 			 * Check the lock order hierarchy for a reveresal.
 			 */
 			if (!isitmydescendant(w, w1))
 				continue;
 		reversal:
 
 			/*
 			 * We have a lock order violation, check to see if it
 			 * is allowed or has already been yelled about.
 			 */
 #ifdef BLESSING
 
 			/*
 			 * If the lock order is blessed, just bail.  We don't
 			 * look for other lock order violations though, which
 			 * may be a bug.
 			 */
 			if (blessed(w, w1))
 				goto out;
 #endif
 
 			/* Bail if this violation is known */
 			if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL)
 				goto out;
 
 			/* Record this as a violation */
 			w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL;
 			w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL;
 			w->w_reversed = w1->w_reversed = 1;
 			witness_increment_graph_generation();
 			mtx_unlock_spin(&w_mtx);
 
 #ifdef WITNESS_NO_VNODE
 			/*
 			 * There are known LORs between VNODE locks. They are
 			 * not an indication of a bug. VNODE locks are flagged
 			 * as such (LO_IS_VNODE) and we don't yell if the LOR
 			 * is between 2 VNODE locks.
 			 */
 			if ((lock->lo_flags & LO_IS_VNODE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0)
 				return;
 #endif
 
 			/*
 			 * Ok, yell about it.
 			 */
 			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
 				witness_output(
 		"lock order reversal: (sleepable after non-sleepable)\n");
 			else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
 			    && lock == &Giant.lock_object)
 				witness_output(
 		"lock order reversal: (Giant after non-sleepable)\n");
 			else
 				witness_output("lock order reversal:\n");
 
 			/*
 			 * Try to locate an earlier lock with
 			 * witness w in our list.
 			 */
 			do {
 				lock2 = &lle->ll_children[i];
 				MPASS(lock2->li_lock != NULL);
 				if (lock2->li_lock->lo_witness == w)
 					break;
 				if (i == 0 && lle->ll_next != NULL) {
 					lle = lle->ll_next;
 					i = lle->ll_count - 1;
 					MPASS(i >= 0 && i < LOCK_NCHILDREN);
 				} else
 					i--;
 			} while (i >= 0);
 			if (i < 0) {
 				witness_output(" 1st %p %s (%s) @ %s:%d\n",
 				    lock1->li_lock, lock1->li_lock->lo_name,
 				    w1->w_name, fixup_filename(lock1->li_file),
 				    lock1->li_line);
 				witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock,
 				    lock->lo_name, w->w_name,
 				    fixup_filename(file), line);
 			} else {
 				witness_output(" 1st %p %s (%s) @ %s:%d\n",
 				    lock2->li_lock, lock2->li_lock->lo_name,
 				    lock2->li_lock->lo_witness->w_name,
 				    fixup_filename(lock2->li_file),
 				    lock2->li_line);
 				witness_output(" 2nd %p %s (%s) @ %s:%d\n",
 				    lock1->li_lock, lock1->li_lock->lo_name,
 				    w1->w_name, fixup_filename(lock1->li_file),
 				    lock1->li_line);
 				witness_output(" 3rd %p %s (%s) @ %s:%d\n", lock,
 				    lock->lo_name, w->w_name,
 				    fixup_filename(file), line);
 			}
 			witness_debugger(1, __func__);
 			return;
 		}
 	}
 
 	/*
 	 * If requested, build a new lock order.  However, don't build a new
 	 * relationship between a sleepable lock and Giant if it is in the
 	 * wrong direction.  The correct lock order is that sleepable locks
 	 * always come before Giant.
 	 */
 	if (flags & LOP_NEWORDER &&
 	    !(plock->li_lock == &Giant.lock_object &&
 	    (lock->lo_flags & LO_SLEEPABLE) != 0)) {
 		CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
 		    w->w_name, plock->li_lock->lo_witness->w_name);
 		itismychild(plock->li_lock->lo_witness, w);
 	}
 out:
 	mtx_unlock_spin(&w_mtx);
 }
 
 void
 witness_lock(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_list_entry **lock_list, *lle;
 	struct lock_instance *instance;
 	struct witness *w;
 	struct thread *td;
 
 	if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL ||
 	    panicstr != NULL)
 		return;
 	w = lock->lo_witness;
 	td = curthread;
 
 	/* Determine lock list for this lock. */
 	if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
 		lock_list = &td->td_sleeplocks;
 	else
 		lock_list = PCPU_PTR(spinlocks);
 
 	/* Check to see if we are recursing on a lock we already own. */
 	instance = find_instance(*lock_list, lock);
 	if (instance != NULL) {
 		instance->li_flags++;
 		CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__,
 		    td->td_proc->p_pid, lock->lo_name,
 		    instance->li_flags & LI_RECURSEMASK);
 		instance->li_file = file;
 		instance->li_line = line;
 		return;
 	}
 
 	/* Update per-witness last file and line acquire. */
 	w->w_file = file;
 	w->w_line = line;
 
 	/* Find the next open lock instance in the list and fill it. */
 	lle = *lock_list;
 	if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
 		lle = witness_lock_list_get();
 		if (lle == NULL)
 			return;
 		lle->ll_next = *lock_list;
 		CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__,
 		    td->td_proc->p_pid, lle);
 		*lock_list = lle;
 	}
 	instance = &lle->ll_children[lle->ll_count++];
 	instance->li_lock = lock;
 	instance->li_line = line;
 	instance->li_file = file;
 	if ((flags & LOP_EXCLUSIVE) != 0)
 		instance->li_flags = LI_EXCLUSIVE;
 	else
 		instance->li_flags = 0;
 	CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__,
 	    td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1);
 }
 
 void
 witness_upgrade(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (witness_watch) {
 		if ((lock->lo_flags & LO_UPGRADABLE) == 0)
 			kassert_panic(
 			    "upgrade of non-upgradable lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
 			kassert_panic(
 			    "upgrade of non-sleep lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 	}
 	instance = find_instance(curthread->td_sleeplocks, lock);
 	if (instance == NULL) {
 		kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d",
 		    class->lc_name, lock->lo_name,
 		    fixup_filename(file), line);
 		return;
 	}
 	if (witness_watch) {
 		if ((instance->li_flags & LI_EXCLUSIVE) != 0)
 			kassert_panic(
 			    "upgrade of exclusive lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((instance->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic(
 			    "upgrade of recursed lock (%s) %s r=%d @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    instance->li_flags & LI_RECURSEMASK,
 			    fixup_filename(file), line);
 	}
 	instance->li_flags |= LI_EXCLUSIVE;
 }
 
 void
 witness_downgrade(struct lock_object *lock, int flags, const char *file,
     int line)
 {
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (witness_watch) {
 		if ((lock->lo_flags & LO_UPGRADABLE) == 0)
 			kassert_panic(
 			    "downgrade of non-upgradable lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
 			kassert_panic(
 			    "downgrade of non-sleep lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 	}
 	instance = find_instance(curthread->td_sleeplocks, lock);
 	if (instance == NULL) {
 		kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d",
 		    class->lc_name, lock->lo_name,
 		    fixup_filename(file), line);
 		return;
 	}
 	if (witness_watch) {
 		if ((instance->li_flags & LI_EXCLUSIVE) == 0)
 			kassert_panic(
 			    "downgrade of shared lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((instance->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic(
 			    "downgrade of recursed lock (%s) %s r=%d @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    instance->li_flags & LI_RECURSEMASK,
 			    fixup_filename(file), line);
 	}
 	instance->li_flags &= ~LI_EXCLUSIVE;
 }
 
 void
 witness_unlock(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_list_entry **lock_list, *lle;
 	struct lock_instance *instance;
 	struct lock_class *class;
 	struct thread *td;
 	register_t s;
 	int i, j;
 
 	if (witness_cold || lock->lo_witness == NULL || panicstr != NULL)
 		return;
 	td = curthread;
 	class = LOCK_CLASS(lock);
 
 	/* Find lock instance associated with this lock. */
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = &td->td_sleeplocks;
 	else
 		lock_list = PCPU_PTR(spinlocks);
 	lle = *lock_list;
 	for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
 		for (i = 0; i < (*lock_list)->ll_count; i++) {
 			instance = &(*lock_list)->ll_children[i];
 			if (instance->li_lock == lock)
 				goto found;
 		}
 
 	/*
 	 * When disabling WITNESS through witness_watch we could end up in
 	 * having registered locks in the td_sleeplocks queue.
 	 * We have to make sure we flush these queues, so just search for
 	 * eventual register locks and remove them.
 	 */
 	if (witness_watch > 0) {
 		kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name,
 		    lock->lo_name, fixup_filename(file), line);
 		return;
 	} else {
 		return;
 	}
 found:
 
 	/* First, check for shared/exclusive mismatches. */
 	if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 &&
 	    (flags & LOP_EXCLUSIVE) == 0) {
 		witness_output("shared unlock of (%s) %s @ %s:%d\n",
 		    class->lc_name, lock->lo_name, fixup_filename(file), line);
 		witness_output("while exclusively locked from %s:%d\n",
 		    fixup_filename(instance->li_file), instance->li_line);
 		kassert_panic("excl->ushare");
 	}
 	if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 &&
 	    (flags & LOP_EXCLUSIVE) != 0) {
 		witness_output("exclusive unlock of (%s) %s @ %s:%d\n",
 		    class->lc_name, lock->lo_name, fixup_filename(file), line);
 		witness_output("while share locked from %s:%d\n",
 		    fixup_filename(instance->li_file),
 		    instance->li_line);
 		kassert_panic("share->uexcl");
 	}
 	/* If we are recursed, unrecurse. */
 	if ((instance->li_flags & LI_RECURSEMASK) > 0) {
 		CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__,
 		    td->td_proc->p_pid, instance->li_lock->lo_name,
 		    instance->li_flags);
 		instance->li_flags--;
 		return;
 	}
 	/* The lock is now being dropped, check for NORELEASE flag */
 	if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) {
 		witness_output("forbidden unlock of (%s) %s @ %s:%d\n",
 		    class->lc_name, lock->lo_name, fixup_filename(file), line);
 		kassert_panic("lock marked norelease");
 	}
 
 	/* Otherwise, remove this item from the list. */
 	s = intr_disable();
 	CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__,
 	    td->td_proc->p_pid, instance->li_lock->lo_name,
 	    (*lock_list)->ll_count - 1);
 	for (j = i; j < (*lock_list)->ll_count - 1; j++)
 		(*lock_list)->ll_children[j] =
 		    (*lock_list)->ll_children[j + 1];
 	(*lock_list)->ll_count--;
 	intr_restore(s);
 
 	/*
 	 * In order to reduce contention on w_mtx, we want to keep always an
 	 * head object into lists so that frequent allocation from the 
 	 * free witness pool (and subsequent locking) is avoided.
 	 * In order to maintain the current code simple, when the head
 	 * object is totally unloaded it means also that we do not have
 	 * further objects in the list, so the list ownership needs to be
 	 * hand over to another object if the current head needs to be freed.
 	 */
 	if ((*lock_list)->ll_count == 0) {
 		if (*lock_list == lle) {
 			if (lle->ll_next == NULL)
 				return;
 		} else
 			lle = *lock_list;
 		*lock_list = lle->ll_next;
 		CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__,
 		    td->td_proc->p_pid, lle);
 		witness_lock_list_free(lle);
 	}
 }
 
 void
 witness_thread_exit(struct thread *td)
 {
 	struct lock_list_entry *lle;
 	int i, n;
 
 	lle = td->td_sleeplocks;
 	if (lle == NULL || panicstr != NULL)
 		return;
 	if (lle->ll_count != 0) {
 		for (n = 0; lle != NULL; lle = lle->ll_next)
 			for (i = lle->ll_count - 1; i >= 0; i--) {
 				if (n == 0)
 					witness_output(
 		    "Thread %p exiting with the following locks held:\n", td);
 				n++;
 				witness_list_lock(&lle->ll_children[i],
 				    witness_output);
 				
 			}
 		kassert_panic(
 		    "Thread %p cannot exit while holding sleeplocks\n", td);
 	}
 	witness_lock_list_free(lle);
 }
 
 /*
  * Warn if any locks other than 'lock' are held.  Flags can be passed in to
  * exempt Giant and sleepable locks from the checks as well.  If any
  * non-exempt locks are held, then a supplied message is printed to the
  * output channel along with a list of the offending locks.  If indicated in the
  * flags then a failure results in a panic as well.
  */
 int
 witness_warn(int flags, struct lock_object *lock, const char *fmt, ...)
 {
 	struct lock_list_entry *lock_list, *lle;
 	struct lock_instance *lock1;
 	struct thread *td;
 	va_list ap;
 	int i, n;
 
 	if (witness_cold || witness_watch < 1 || panicstr != NULL)
 		return (0);
 	n = 0;
 	td = curthread;
 	for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next)
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			lock1 = &lle->ll_children[i];
 			if (lock1->li_lock == lock)
 				continue;
 			if (flags & WARN_GIANTOK &&
 			    lock1->li_lock == &Giant.lock_object)
 				continue;
 			if (flags & WARN_SLEEPOK &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
 				continue;
 			if (n == 0) {
 				va_start(ap, fmt);
 				vprintf(fmt, ap);
 				va_end(ap);
 				printf(" with the following %slocks held:\n",
 				    (flags & WARN_SLEEPOK) != 0 ?
 				    "non-sleepable " : "");
 			}
 			n++;
 			witness_list_lock(lock1, printf);
 		}
 
 	/*
 	 * Pin the thread in order to avoid problems with thread migration.
 	 * Once that all verifies are passed about spinlocks ownership,
 	 * the thread is in a safe path and it can be unpinned.
 	 */
 	sched_pin();
 	lock_list = PCPU_GET(spinlocks);
 	if (lock_list != NULL && lock_list->ll_count != 0) {
 		sched_unpin();
 
 		/*
 		 * We should only have one spinlock and as long as
 		 * the flags cannot match for this locks class,
 		 * check if the first spinlock is the one curthread
 		 * should hold.
 		 */
 		lock1 = &lock_list->ll_children[lock_list->ll_count - 1];
 		if (lock_list->ll_count == 1 && lock_list->ll_next == NULL &&
 		    lock1->li_lock == lock && n == 0)
 			return (0);
 
 		va_start(ap, fmt);
 		vprintf(fmt, ap);
 		va_end(ap);
 		printf(" with the following %slocks held:\n",
 		    (flags & WARN_SLEEPOK) != 0 ?  "non-sleepable " : "");
 		n += witness_list_locks(&lock_list, printf);
 	} else
 		sched_unpin();
 	if (flags & WARN_PANIC && n)
 		kassert_panic("%s", __func__);
 	else
 		witness_debugger(n, __func__);
 	return (n);
 }
 
 const char *
 witness_file(struct lock_object *lock)
 {
 	struct witness *w;
 
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
 		return ("?");
 	w = lock->lo_witness;
 	return (w->w_file);
 }
 
 int
 witness_line(struct lock_object *lock)
 {
 	struct witness *w;
 
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
 		return (0);
 	w = lock->lo_witness;
 	return (w->w_line);
 }
 
 static struct witness *
 enroll(const char *description, struct lock_class *lock_class)
 {
 	struct witness *w;
 
 	MPASS(description != NULL);
 
 	if (witness_watch == -1 || panicstr != NULL)
 		return (NULL);
 	if ((lock_class->lc_flags & LC_SPINLOCK)) {
 		if (witness_skipspin)
 			return (NULL);
 	} else if ((lock_class->lc_flags & LC_SLEEPLOCK) == 0) {
 		kassert_panic("lock class %s is not sleep or spin",
 		    lock_class->lc_name);
 		return (NULL);
 	}
 
 	mtx_lock_spin(&w_mtx);
 	w = witness_hash_get(description);
 	if (w)
 		goto found;
 	if ((w = witness_get()) == NULL)
 		return (NULL);
 	MPASS(strlen(description) < MAX_W_NAME);
 	strcpy(w->w_name, description);
 	w->w_class = lock_class;
 	w->w_refcount = 1;
 	STAILQ_INSERT_HEAD(&w_all, w, w_list);
 	if (lock_class->lc_flags & LC_SPINLOCK) {
 		STAILQ_INSERT_HEAD(&w_spin, w, w_typelist);
 		w_spin_cnt++;
 	} else if (lock_class->lc_flags & LC_SLEEPLOCK) {
 		STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist);
 		w_sleep_cnt++;
 	}
 
 	/* Insert new witness into the hash */
 	witness_hash_put(w);
 	witness_increment_graph_generation();
 	mtx_unlock_spin(&w_mtx);
 	return (w);
 found:
 	w->w_refcount++;
 	if (w->w_refcount == 1)
 		w->w_class = lock_class;
 	mtx_unlock_spin(&w_mtx);
 	if (lock_class != w->w_class)
 		kassert_panic(
 		    "lock (%s) %s does not match earlier (%s) lock",
 		    description, lock_class->lc_name,
 		    w->w_class->lc_name);
 	return (w);
 }
 
 static void
 depart(struct witness *w)
 {
 
 	MPASS(w->w_refcount == 0);
 	if (w->w_class->lc_flags & LC_SLEEPLOCK) {
 		w_sleep_cnt--;
 	} else {
 		w_spin_cnt--;
 	}
 	/*
 	 * Set file to NULL as it may point into a loadable module.
 	 */
 	w->w_file = NULL;
 	w->w_line = 0;
 	witness_increment_graph_generation();
 }
 
 
 static void
 adopt(struct witness *parent, struct witness *child)
 {
 	int pi, ci, i, j;
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	/* If the relationship is already known, there's no work to be done. */
 	if (isitmychild(parent, child))
 		return;
 
 	/* When the structure of the graph changes, bump up the generation. */
 	witness_increment_graph_generation();
 
 	/*
 	 * The hard part ... create the direct relationship, then propagate all
 	 * indirect relationships.
 	 */
 	pi = parent->w_index;
 	ci = child->w_index;
 	WITNESS_INDEX_ASSERT(pi);
 	WITNESS_INDEX_ASSERT(ci);
 	MPASS(pi != ci);
 	w_rmatrix[pi][ci] |= WITNESS_PARENT;
 	w_rmatrix[ci][pi] |= WITNESS_CHILD;
 
 	/*
 	 * If parent was not already an ancestor of child,
 	 * then we increment the descendant and ancestor counters.
 	 */
 	if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) {
 		parent->w_num_descendants++;
 		child->w_num_ancestors++;
 	}
 
 	/* 
 	 * Find each ancestor of 'pi'. Note that 'pi' itself is counted as 
 	 * an ancestor of 'pi' during this loop.
 	 */
 	for (i = 1; i <= w_max_used_index; i++) {
 		if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && 
 		    (i != pi))
 			continue;
 
 		/* Find each descendant of 'i' and mark it as a descendant. */
 		for (j = 1; j <= w_max_used_index; j++) {
 
 			/* 
 			 * Skip children that are already marked as
 			 * descendants of 'i'.
 			 */
 			if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK)
 				continue;
 
 			/*
 			 * We are only interested in descendants of 'ci'. Note
 			 * that 'ci' itself is counted as a descendant of 'ci'.
 			 */
 			if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && 
 			    (j != ci))
 				continue;
 			w_rmatrix[i][j] |= WITNESS_ANCESTOR;
 			w_rmatrix[j][i] |= WITNESS_DESCENDANT;
 			w_data[i].w_num_descendants++;
 			w_data[j].w_num_ancestors++;
 
 			/* 
 			 * Make sure we aren't marking a node as both an
 			 * ancestor and descendant. We should have caught 
 			 * this as a lock order reversal earlier.
 			 */
 			if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
 			    (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
 				printf("witness rmatrix paradox! [%d][%d]=%d "
 				    "both ancestor and descendant\n",
 				    i, j, w_rmatrix[i][j]); 
 				kdb_backtrace();
 				printf("Witness disabled.\n");
 				witness_watch = -1;
 			}
 			if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) &&
 			    (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
 				printf("witness rmatrix paradox! [%d][%d]=%d "
 				    "both ancestor and descendant\n",
 				    j, i, w_rmatrix[j][i]); 
 				kdb_backtrace();
 				printf("Witness disabled.\n");
 				witness_watch = -1;
 			}
 		}
 	}
 }
 
 static void
 itismychild(struct witness *parent, struct witness *child)
 {
 	int unlocked;
 
 	MPASS(child != NULL && parent != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	if (!witness_lock_type_equal(parent, child)) {
 		if (witness_cold == 0) {
 			unlocked = 1;
 			mtx_unlock_spin(&w_mtx);
 		} else {
 			unlocked = 0;
 		}
 		kassert_panic(
 		    "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not "
 		    "the same lock type", __func__, parent->w_name,
 		    parent->w_class->lc_name, child->w_name,
 		    child->w_class->lc_name);
 		if (unlocked)
 			mtx_lock_spin(&w_mtx);
 	}
 	adopt(parent, child);
 }
 
 /*
  * Generic code for the isitmy*() functions. The rmask parameter is the
  * expected relationship of w1 to w2.
  */
 static int
 _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
 {
 	unsigned char r1, r2;
 	int i1, i2;
 
 	i1 = w1->w_index;
 	i2 = w2->w_index;
 	WITNESS_INDEX_ASSERT(i1);
 	WITNESS_INDEX_ASSERT(i2);
 	r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK;
 	r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK;
 
 	/* The flags on one better be the inverse of the flags on the other */
 	if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) ||
 	    (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) {
 		/* Don't squawk if we're potentially racing with an update. */
 		if (!mtx_owned(&w_mtx))
 			return (0);
 		printf("%s: rmatrix mismatch between %s (index %d) and %s "
 		    "(index %d): w_rmatrix[%d][%d] == %hhx but "
 		    "w_rmatrix[%d][%d] == %hhx\n",
 		    fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1,
 		    i2, i1, r2);
 		kdb_backtrace();
 		printf("Witness disabled.\n");
 		witness_watch = -1;
 	}
 	return (r1 & rmask);
 }
 
 /*
  * Checks if @child is a direct child of @parent.
  */
 static int
 isitmychild(struct witness *parent, struct witness *child)
 {
 
 	return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
 }
 
 /*
  * Checks if @descendant is a direct or inderect descendant of @ancestor.
  */
 static int
 isitmydescendant(struct witness *ancestor, struct witness *descendant)
 {
 
 	return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
 	    __func__));
 }
 
 #ifdef BLESSING
 static int
 blessed(struct witness *w1, struct witness *w2)
 {
 	int i;
 	struct witness_blessed *b;
 
 	for (i = 0; i < nitems(blessed_list); i++) {
 		b = &blessed_list[i];
 		if (strcmp(w1->w_name, b->b_lock1) == 0) {
 			if (strcmp(w2->w_name, b->b_lock2) == 0)
 				return (1);
 			continue;
 		}
 		if (strcmp(w1->w_name, b->b_lock2) == 0)
 			if (strcmp(w2->w_name, b->b_lock1) == 0)
 				return (1);
 	}
 	return (0);
 }
 #endif
 
 static struct witness *
 witness_get(void)
 {
 	struct witness *w;
 	int index;
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	if (witness_watch == -1) {
 		mtx_unlock_spin(&w_mtx);
 		return (NULL);
 	}
 	if (STAILQ_EMPTY(&w_free)) {
 		witness_watch = -1;
 		mtx_unlock_spin(&w_mtx);
 		printf("WITNESS: unable to allocate a new witness object\n");
 		return (NULL);
 	}
 	w = STAILQ_FIRST(&w_free);
 	STAILQ_REMOVE_HEAD(&w_free, w_list);
 	w_free_cnt--;
 	index = w->w_index;
 	MPASS(index > 0 && index == w_max_used_index+1 &&
 	    index < witness_count);
 	bzero(w, sizeof(*w));
 	w->w_index = index;
 	if (index > w_max_used_index)
 		w_max_used_index = index;
 	return (w);
 }
 
 static void
 witness_free(struct witness *w)
 {
 
 	STAILQ_INSERT_HEAD(&w_free, w, w_list);
 	w_free_cnt++;
 }
 
 static struct lock_list_entry *
 witness_lock_list_get(void)
 {
 	struct lock_list_entry *lle;
 
 	if (witness_watch == -1)
 		return (NULL);
 	mtx_lock_spin(&w_mtx);
 	lle = w_lock_list_free;
 	if (lle == NULL) {
 		witness_watch = -1;
 		mtx_unlock_spin(&w_mtx);
 		printf("%s: witness exhausted\n", __func__);
 		return (NULL);
 	}
 	w_lock_list_free = lle->ll_next;
 	mtx_unlock_spin(&w_mtx);
 	bzero(lle, sizeof(*lle));
 	return (lle);
 }
 		
 static void
 witness_lock_list_free(struct lock_list_entry *lle)
 {
 
 	mtx_lock_spin(&w_mtx);
 	lle->ll_next = w_lock_list_free;
 	w_lock_list_free = lle;
 	mtx_unlock_spin(&w_mtx);
 }
 
 static struct lock_instance *
 find_instance(struct lock_list_entry *list, const struct lock_object *lock)
 {
 	struct lock_list_entry *lle;
 	struct lock_instance *instance;
 	int i;
 
 	for (lle = list; lle != NULL; lle = lle->ll_next)
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			instance = &lle->ll_children[i];
 			if (instance->li_lock == lock)
 				return (instance);
 		}
 	return (NULL);
 }
 
 static void
 witness_list_lock(struct lock_instance *instance,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_object *lock;
 
 	lock = instance->li_lock;
 	prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
 	    "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
 	if (lock->lo_witness->w_name != lock->lo_name)
 		prnt(" (%s)", lock->lo_witness->w_name);
 	prnt(" r = %d (%p) locked @ %s:%d\n",
 	    instance->li_flags & LI_RECURSEMASK, lock,
 	    fixup_filename(instance->li_file), instance->li_line);
 }
 
 static int
 witness_output(const char *fmt, ...)
 {
 	va_list ap;
 	int ret;
 
 	va_start(ap, fmt);
 	ret = witness_voutput(fmt, ap);
 	va_end(ap);
 	return (ret);
 }
 
 static int
 witness_voutput(const char *fmt, va_list ap)
 {
 	int ret;
 
 	ret = 0;
 	switch (witness_channel) {
 	case WITNESS_CONSOLE:
 		ret = vprintf(fmt, ap);
 		break;
 	case WITNESS_LOG:
 		vlog(LOG_NOTICE, fmt, ap);
 		break;
 	case WITNESS_NONE:
 		break;
 	}
 	return (ret);
 }
 
 #ifdef DDB
 static int
 witness_thread_has_locks(struct thread *td)
 {
 
 	if (td->td_sleeplocks == NULL)
 		return (0);
 	return (td->td_sleeplocks->ll_count != 0);
 }
 
 static int
 witness_proc_has_locks(struct proc *p)
 {
 	struct thread *td;
 
 	FOREACH_THREAD_IN_PROC(p, td) {
 		if (witness_thread_has_locks(td))
 			return (1);
 	}
 	return (0);
 }
 #endif
 
 int
 witness_list_locks(struct lock_list_entry **lock_list,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_list_entry *lle;
 	int i, nheld;
 
 	nheld = 0;
 	for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			witness_list_lock(&lle->ll_children[i], prnt);
 			nheld++;
 		}
 	return (nheld);
 }
 
 /*
  * This is a bit risky at best.  We call this function when we have timed
  * out acquiring a spin lock, and we assume that the other CPU is stuck
  * with this lock held.  So, we go groveling around in the other CPU's
  * per-cpu data to try to find the lock instance for this spin lock to
  * see when it was last acquired.
  */
 void
 witness_display_spinlock(struct lock_object *lock, struct thread *owner,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_instance *instance;
 	struct pcpu *pc;
 
 	if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU)
 		return;
 	pc = pcpu_find(owner->td_oncpu);
 	instance = find_instance(pc->pc_spinlocks, lock);
 	if (instance != NULL)
 		witness_list_lock(instance, prnt);
 }
 
 void
 witness_save(struct lock_object *lock, const char **filep, int *linep)
 {
 	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	/*
 	 * This function is used independently in locking code to deal with
 	 * Giant, SCHEDULER_STOPPED() check can be removed here after Giant
 	 * is gone.
 	 */
 	if (SCHEDULER_STOPPED())
 		return;
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = curthread->td_sleeplocks;
 	else {
 		if (witness_skipspin)
 			return;
 		lock_list = PCPU_GET(spinlocks);
 	}
 	instance = find_instance(lock_list, lock);
 	if (instance == NULL) {
 		kassert_panic("%s: lock (%s) %s not locked", __func__,
 		    class->lc_name, lock->lo_name);
 		return;
 	}
 	*filep = instance->li_file;
 	*linep = instance->li_line;
 }
 
 void
 witness_restore(struct lock_object *lock, const char *file, int line)
 {
 	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	/*
 	 * This function is used independently in locking code to deal with
 	 * Giant, SCHEDULER_STOPPED() check can be removed here after Giant
 	 * is gone.
 	 */
 	if (SCHEDULER_STOPPED())
 		return;
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = curthread->td_sleeplocks;
 	else {
 		if (witness_skipspin)
 			return;
 		lock_list = PCPU_GET(spinlocks);
 	}
 	instance = find_instance(lock_list, lock);
 	if (instance == NULL)
 		kassert_panic("%s: lock (%s) %s not locked", __func__,
 		    class->lc_name, lock->lo_name);
 	lock->lo_witness->w_file = file;
 	lock->lo_witness->w_line = line;
 	if (instance == NULL)
 		return;
 	instance->li_file = file;
 	instance->li_line = line;
 }
 
 void
 witness_assert(const struct lock_object *lock, int flags, const char *file,
     int line)
 {
 #ifdef INVARIANT_SUPPORT
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if ((class->lc_flags & LC_SLEEPLOCK) != 0)
 		instance = find_instance(curthread->td_sleeplocks, lock);
 	else if ((class->lc_flags & LC_SPINLOCK) != 0)
 		instance = find_instance(PCPU_GET(spinlocks), lock);
 	else {
 		kassert_panic("Lock (%s) %s is not sleep or spin!",
 		    class->lc_name, lock->lo_name);
 		return;
 	}
 	switch (flags) {
 	case LA_UNLOCKED:
 		if (instance != NULL)
 			kassert_panic("Lock (%s) %s locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		break;
 	case LA_LOCKED:
 	case LA_LOCKED | LA_RECURSED:
 	case LA_LOCKED | LA_NOTRECURSED:
 	case LA_SLOCKED:
 	case LA_SLOCKED | LA_RECURSED:
 	case LA_SLOCKED | LA_NOTRECURSED:
 	case LA_XLOCKED:
 	case LA_XLOCKED | LA_RECURSED:
 	case LA_XLOCKED | LA_NOTRECURSED:
 		if (instance == NULL) {
 			kassert_panic("Lock (%s) %s not locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 			break;
 		}
 		if ((flags & LA_XLOCKED) != 0 &&
 		    (instance->li_flags & LI_EXCLUSIVE) == 0)
 			kassert_panic(
 			    "Lock (%s) %s not exclusively locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((flags & LA_SLOCKED) != 0 &&
 		    (instance->li_flags & LI_EXCLUSIVE) != 0)
 			kassert_panic(
 			    "Lock (%s) %s exclusively locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((flags & LA_RECURSED) != 0 &&
 		    (instance->li_flags & LI_RECURSEMASK) == 0)
 			kassert_panic("Lock (%s) %s not recursed @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((flags & LA_NOTRECURSED) != 0 &&
 		    (instance->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic("Lock (%s) %s recursed @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		break;
 	default:
 		kassert_panic("Invalid lock assertion at %s:%d.",
 		    fixup_filename(file), line);
 
 	}
 #endif	/* INVARIANT_SUPPORT */
 }
 
 static void
 witness_setflag(struct lock_object *lock, int flag, int set)
 {
 	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = curthread->td_sleeplocks;
 	else {
 		if (witness_skipspin)
 			return;
 		lock_list = PCPU_GET(spinlocks);
 	}
 	instance = find_instance(lock_list, lock);
 	if (instance == NULL) {
 		kassert_panic("%s: lock (%s) %s not locked", __func__,
 		    class->lc_name, lock->lo_name);
 		return;
 	}
 
 	if (set)
 		instance->li_flags |= flag;
 	else
 		instance->li_flags &= ~flag;
 }
 
 void
 witness_norelease(struct lock_object *lock)
 {
 
 	witness_setflag(lock, LI_NORELEASE, 1);
 }
 
 void
 witness_releaseok(struct lock_object *lock)
 {
 
 	witness_setflag(lock, LI_NORELEASE, 0);
 }
 
 #ifdef DDB
 static void
 witness_ddb_list(struct thread *td)
 {
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	KASSERT(kdb_active, ("%s: not in the debugger", __func__));
 
 	if (witness_watch < 1)
 		return;
 
 	witness_list_locks(&td->td_sleeplocks, db_printf);
 
 	/*
 	 * We only handle spinlocks if td == curthread.  This is somewhat broken
 	 * if td is currently executing on some other CPU and holds spin locks
 	 * as we won't display those locks.  If we had a MI way of getting
 	 * the per-cpu data for a given cpu then we could use
 	 * td->td_oncpu to get the list of spinlocks for this thread
 	 * and "fix" this.
 	 *
 	 * That still wouldn't really fix this unless we locked the scheduler
 	 * lock or stopped the other CPU to make sure it wasn't changing the
 	 * list out from under us.  It is probably best to just not try to
 	 * handle threads on other CPU's for now.
 	 */
 	if (td == curthread && PCPU_GET(spinlocks) != NULL)
 		witness_list_locks(PCPU_PTR(spinlocks), db_printf);
 }
 
 DB_SHOW_COMMAND(locks, db_witness_list)
 {
 	struct thread *td;
 
 	if (have_addr)
 		td = db_lookup_thread(addr, true);
 	else
 		td = kdb_thread;
 	witness_ddb_list(td);
 }
 
 DB_SHOW_ALL_COMMAND(locks, db_witness_list_all)
 {
 	struct thread *td;
 	struct proc *p;
 
 	/*
 	 * It would be nice to list only threads and processes that actually
 	 * held sleep locks, but that information is currently not exported
 	 * by WITNESS.
 	 */
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (!witness_proc_has_locks(p))
 			continue;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			if (!witness_thread_has_locks(td))
 				continue;
 			db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid,
 			    p->p_comm, td, td->td_tid);
 			witness_ddb_list(td);
 			if (db_pager_quit)
 				return;
 		}
 	}
 }
 DB_SHOW_ALIAS(alllocks, db_witness_list_all)
 
 DB_SHOW_COMMAND(witness, db_witness_display)
 {
 
 	witness_ddb_display(db_printf);
 }
 #endif
 
 static void
 sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx)
 {
 	struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2;
 	struct witness *tmp_w1, *tmp_w2, *w1, *w2;
 	int generation, i, j;
 
 	tmp_data1 = NULL;
 	tmp_data2 = NULL;
 	tmp_w1 = NULL;
 	tmp_w2 = NULL;
 
 	/* Allocate and init temporary storage space. */
 	tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
 	tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
 	tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, 
 	    M_WAITOK | M_ZERO);
 	tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, 
 	    M_WAITOK | M_ZERO);
 	stack_zero(&tmp_data1->wlod_stack);
 	stack_zero(&tmp_data2->wlod_stack);
 
 restart:
 	mtx_lock_spin(&w_mtx);
 	generation = w_generation;
 	mtx_unlock_spin(&w_mtx);
 	sbuf_printf(sb, "Number of known direct relationships is %d\n",
 	    w_lohash.wloh_count);
 	for (i = 1; i < w_max_used_index; i++) {
 		mtx_lock_spin(&w_mtx);
 		if (generation != w_generation) {
 			mtx_unlock_spin(&w_mtx);
 
 			/* The graph has changed, try again. */
 			*oldidx = 0;
 			sbuf_clear(sb);
 			goto restart;
 		}
 
 		w1 = &w_data[i];
 		if (w1->w_reversed == 0) {
 			mtx_unlock_spin(&w_mtx);
 			continue;
 		}
 
 		/* Copy w1 locally so we can release the spin lock. */
 		*tmp_w1 = *w1;
 		mtx_unlock_spin(&w_mtx);
 
 		if (tmp_w1->w_reversed == 0)
 			continue;
 		for (j = 1; j < w_max_used_index; j++) {
 			if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
 				continue;
 
 			mtx_lock_spin(&w_mtx);
 			if (generation != w_generation) {
 				mtx_unlock_spin(&w_mtx);
 
 				/* The graph has changed, try again. */
 				*oldidx = 0;
 				sbuf_clear(sb);
 				goto restart;
 			}
 
 			w2 = &w_data[j];
 			data1 = witness_lock_order_get(w1, w2);
 			data2 = witness_lock_order_get(w2, w1);
 
 			/*
 			 * Copy information locally so we can release the
 			 * spin lock.
 			 */
 			*tmp_w2 = *w2;
 
 			if (data1) {
 				stack_zero(&tmp_data1->wlod_stack);
 				stack_copy(&data1->wlod_stack,
 				    &tmp_data1->wlod_stack);
 			}
 			if (data2 && data2 != data1) {
 				stack_zero(&tmp_data2->wlod_stack);
 				stack_copy(&data2->wlod_stack,
 				    &tmp_data2->wlod_stack);
 			}
 			mtx_unlock_spin(&w_mtx);
 
 			sbuf_printf(sb,
 	    "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n",
 			    tmp_w1->w_name, tmp_w1->w_class->lc_name, 
 			    tmp_w2->w_name, tmp_w2->w_class->lc_name);
 			if (data1) {
 				sbuf_printf(sb,
 			"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
 				    tmp_w1->w_name, tmp_w1->w_class->lc_name, 
 				    tmp_w2->w_name, tmp_w2->w_class->lc_name);
 				stack_sbuf_print(sb, &tmp_data1->wlod_stack);
 				sbuf_printf(sb, "\n");
 			}
 			if (data2 && data2 != data1) {
 				sbuf_printf(sb,
 			"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
 				    tmp_w2->w_name, tmp_w2->w_class->lc_name, 
 				    tmp_w1->w_name, tmp_w1->w_class->lc_name);
 				stack_sbuf_print(sb, &tmp_data2->wlod_stack);
 				sbuf_printf(sb, "\n");
 			}
 		}
 	}
 	mtx_lock_spin(&w_mtx);
 	if (generation != w_generation) {
 		mtx_unlock_spin(&w_mtx);
 
 		/*
 		 * The graph changed while we were printing stack data,
 		 * try again.
 		 */
 		*oldidx = 0;
 		sbuf_clear(sb);
 		goto restart;
 	}
 	mtx_unlock_spin(&w_mtx);
 
 	/* Free temporary storage space. */
 	free(tmp_data1, M_TEMP);
 	free(tmp_data2, M_TEMP);
 	free(tmp_w1, M_TEMP);
 	free(tmp_w2, M_TEMP);
 }
 
 static int
 sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf *sb;
 	int error;
 
 	if (witness_watch < 1) {
 		error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
 		return (error);
 	}
 	if (witness_cold) {
 		error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
 		return (error);
 	}
 	error = 0;
 	sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_print_witness_badstacks(sb, &req->oldidx);
 
 	sbuf_finish(sb);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 
 	return (error);
 }
 
 #ifdef DDB
 static int
 sbuf_db_printf_drain(void *arg __unused, const char *data, int len)
 {
 
 	return (db_printf("%.*s", len, data));
 }
 
 DB_SHOW_COMMAND(badstacks, db_witness_badstacks)
 {
 	struct sbuf sb;
 	char buffer[128];
 	size_t dummy;
 
 	sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN);
 	sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL);
 	sbuf_print_witness_badstacks(&sb, &dummy);
 	sbuf_finish(&sb);
 }
 #endif
 
 static int
 sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS)
 {
 	static const struct {
 		enum witness_channel channel;
 		const char *name;
 	} channels[] = {
 		{ WITNESS_CONSOLE, "console" },
 		{ WITNESS_LOG, "log" },
 		{ WITNESS_NONE, "none" },
 	};
 	char buf[16];
 	u_int i;
 	int error;
 
 	buf[0] = '\0';
 	for (i = 0; i < nitems(channels); i++)
 		if (witness_channel == channels[i].channel) {
 			snprintf(buf, sizeof(buf), "%s", channels[i].name);
 			break;
 		}
 
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	error = EINVAL;
 	for (i = 0; i < nitems(channels); i++)
 		if (strcmp(channels[i].name, buf) == 0) {
 			witness_channel = channels[i].channel;
 			error = 0;
 			break;
 		}
 	return (error);
 }
 
 static int
 sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS)
 {
 	struct witness *w;
 	struct sbuf *sb;
 	int error;
 
 #ifdef __i386__
 	error = SYSCTL_OUT(req, w_notallowed, sizeof(w_notallowed));
 	return (error);
 #endif
 
 	if (witness_watch < 1) {
 		error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
 		return (error);
 	}
 	if (witness_cold) {
 		error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
 		return (error);
 	}
 	error = 0;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req);
 	if (sb == NULL)
 		return (ENOMEM);
 	sbuf_printf(sb, "\n");
 
 	mtx_lock_spin(&w_mtx);
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_displayed = 0;
 	STAILQ_FOREACH(w, &w_all, w_list)
 		witness_add_fullgraph(sb, w);
 	mtx_unlock_spin(&w_mtx);
 
 	/*
 	 * Close the sbuf and return to userland.
 	 */
 	error = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (error);
 }
 
 static int
 sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS)
 {
 	int error, value;
 
 	value = witness_watch;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (value > 1 || value < -1 ||
 	    (witness_watch == -1 && value != witness_watch))
 		return (EINVAL);
 	witness_watch = value;
 	return (0);
 }
 
 static void
 witness_add_fullgraph(struct sbuf *sb, struct witness *w)
 {
 	int i;
 
 	if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0))
 		return;
 	w->w_displayed = 1;
 
 	WITNESS_INDEX_ASSERT(w->w_index);
 	for (i = 1; i <= w_max_used_index; i++) {
 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
 			sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name,
 			    w_data[i].w_name);
 			witness_add_fullgraph(sb, &w_data[i]);
 		}
 	}
 }
 
 /*
  * A simple hash function. Takes a key pointer and a key size. If size == 0,
  * interprets the key as a string and reads until the null
  * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit
  * hash value computed from the key.
  */
 static uint32_t
 witness_hash_djb2(const uint8_t *key, uint32_t size)
 {
 	unsigned int hash = 5381;
 	int i;
 
 	/* hash = hash * 33 + key[i] */
 	if (size)
 		for (i = 0; i < size; i++)
 			hash = ((hash << 5) + hash) + (unsigned int)key[i];
 	else
 		for (i = 0; key[i] != 0; i++)
 			hash = ((hash << 5) + hash) + (unsigned int)key[i];
 
 	return (hash);
 }
 
 
 /*
  * Initializes the two witness hash tables. Called exactly once from
  * witness_initialize().
  */
 static void
 witness_init_hash_tables(void)
 {
 	int i;
 
 	MPASS(witness_cold);
 
 	/* Initialize the hash tables. */
 	for (i = 0; i < WITNESS_HASH_SIZE; i++)
 		w_hash.wh_array[i] = NULL;
 
 	w_hash.wh_size = WITNESS_HASH_SIZE;
 	w_hash.wh_count = 0;
 
 	/* Initialize the lock order data hash. */
 	w_lofree = NULL;
 	for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) {
 		memset(&w_lodata[i], 0, sizeof(w_lodata[i]));
 		w_lodata[i].wlod_next = w_lofree;
 		w_lofree = &w_lodata[i];
 	}
 	w_lohash.wloh_size = WITNESS_LO_HASH_SIZE;
 	w_lohash.wloh_count = 0;
 	for (i = 0; i < WITNESS_LO_HASH_SIZE; i++)
 		w_lohash.wloh_array[i] = NULL;
 }
 
 static struct witness *
 witness_hash_get(const char *key)
 {
 	struct witness *w;
 	uint32_t hash;
 	
 	MPASS(key != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	hash = witness_hash_djb2(key, 0) % w_hash.wh_size;
 	w = w_hash.wh_array[hash];
 	while (w != NULL) {
 		if (strcmp(w->w_name, key) == 0)
 			goto out;
 		w = w->w_hash_next;
 	}
 
 out:
 	return (w);
 }
 
 static void
 witness_hash_put(struct witness *w)
 {
 	uint32_t hash;
 
 	MPASS(w != NULL);
 	MPASS(w->w_name != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	KASSERT(witness_hash_get(w->w_name) == NULL,
 	    ("%s: trying to add a hash entry that already exists!", __func__));
 	KASSERT(w->w_hash_next == NULL,
 	    ("%s: w->w_hash_next != NULL", __func__));
 
 	hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size;
 	w->w_hash_next = w_hash.wh_array[hash];
 	w_hash.wh_array[hash] = w;
 	w_hash.wh_count++;
 }
 
 
 static struct witness_lock_order_data *
 witness_lock_order_get(struct witness *parent, struct witness *child)
 {
 	struct witness_lock_order_data *data = NULL;
 	struct witness_lock_order_key key;
 	unsigned int hash;
 
 	MPASS(parent != NULL && child != NULL);
 	key.from = parent->w_index;
 	key.to = child->w_index;
 	WITNESS_INDEX_ASSERT(key.from);
 	WITNESS_INDEX_ASSERT(key.to);
 	if ((w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN) == 0)
 		goto out;
 
 	hash = witness_hash_djb2((const char*)&key,
 	    sizeof(key)) % w_lohash.wloh_size;
 	data = w_lohash.wloh_array[hash];
 	while (data != NULL) {
 		if (witness_lock_order_key_equal(&data->wlod_key, &key))
 			break;
 		data = data->wlod_next;
 	}
 
 out:
 	return (data);
 }
 
 /*
  * Verify that parent and child have a known relationship, are not the same,
  * and child is actually a child of parent.  This is done without w_mtx
  * to avoid contention in the common case.
  */
 static int
 witness_lock_order_check(struct witness *parent, struct witness *child)
 {
 
 	if (parent != child &&
 	    w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN &&
 	    isitmychild(parent, child))
 		return (1);
 
 	return (0);
 }
 
 static int
 witness_lock_order_add(struct witness *parent, struct witness *child)
 {
 	struct witness_lock_order_data *data = NULL;
 	struct witness_lock_order_key key;
 	unsigned int hash;
 	
 	MPASS(parent != NULL && child != NULL);
 	key.from = parent->w_index;
 	key.to = child->w_index;
 	WITNESS_INDEX_ASSERT(key.from);
 	WITNESS_INDEX_ASSERT(key.to);
 	if (w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN)
 		return (1);
 
 	hash = witness_hash_djb2((const char*)&key,
 	    sizeof(key)) % w_lohash.wloh_size;
 	w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
 	data = w_lofree;
 	if (data == NULL)
 		return (0);
 	w_lofree = data->wlod_next;
 	data->wlod_next = w_lohash.wloh_array[hash];
 	data->wlod_key = key;
 	w_lohash.wloh_array[hash] = data;
 	w_lohash.wloh_count++;
 	stack_zero(&data->wlod_stack);
 	stack_save(&data->wlod_stack);
 	return (1);
 }
 
 /* Call this whenever the structure of the witness graph changes. */
 static void
 witness_increment_graph_generation(void)
 {
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	w_generation++;
 }
 
 static int
 witness_output_drain(void *arg __unused, const char *data, int len)
 {
 
 	witness_output("%.*s", len, data);
 	return (len);
 }
 
 static void
 witness_debugger(int cond, const char *msg)
 {
 	char buf[32];
 	struct sbuf sb;
 	struct stack st;
 
 	if (!cond)
 		return;
 
 	if (witness_trace) {
 		sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
 		sbuf_set_drain(&sb, witness_output_drain, NULL);
 
 		stack_zero(&st);
 		stack_save(&st);
 		witness_output("stack backtrace:\n");
 		stack_sbuf_print_ddb(&sb, &st);
 
 		sbuf_finish(&sb);
 	}
 
 #ifdef KDB
 	if (witness_kdb)
 		kdb_enter(KDB_WHY_WITNESS, msg);
 #endif
 }
Index: head/sys/netinet/in_pcb.h
===================================================================
--- head/sys/netinet/in_pcb.h	(revision 335355)
+++ head/sys/netinet/in_pcb.h	(revision 335356)
@@ -1,892 +1,891 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NETINET_IN_PCB_H_
 #define _NETINET_IN_PCB_H_
 
 #include <sys/queue.h>
 #include <sys/epoch.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/_rwlock.h>
 #include <net/route.h>
 
 #ifdef _KERNEL
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <net/vnet.h>
+#include <net/if.h>
+#include <net/if_var.h>
 #include <vm/uma.h>
 #endif
 #include <sys/ck.h>
 
 #define	in6pcb		inpcb	/* for KAME src sync over BSD*'s */
 #define	in6p_sp		inp_sp	/* for KAME src sync over BSD*'s */
 
 /*
  * struct inpcb is the common protocol control block structure used in most
  * IP transport protocols.
  *
  * Pointers to local and foreign host table entries, local and foreign socket
  * numbers, and pointers up (to a socket structure) and down (to a
  * protocol-specific control block) are stored here.
  */
 CK_LIST_HEAD(inpcbhead, inpcb);
 CK_LIST_HEAD(inpcbporthead, inpcbport);
 typedef	uint64_t	inp_gen_t;
 
 /*
  * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet.
  * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing
  * the following structure.
  */
 struct in_addr_4in6 {
 	u_int32_t	ia46_pad32[3];
 	struct	in_addr	ia46_addr4;
 };
 
 union in_dependaddr {
 	struct in_addr_4in6 id46_addr;
 	struct in6_addr	id6_addr;
 };
 
 /*
  * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553.  in_conninfo has
  * some extra padding to accomplish this.
  * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport,
  * lport, faddr to generate hash, so these fields shouldn't be moved.
  */
 struct in_endpoints {
 	u_int16_t	ie_fport;		/* foreign port */
 	u_int16_t	ie_lport;		/* local port */
 	/* protocol dependent part, local and foreign addr */
 	union in_dependaddr ie_dependfaddr;	/* foreign host table entry */
 	union in_dependaddr ie_dependladdr;	/* local host table entry */
 #define	ie_faddr	ie_dependfaddr.id46_addr.ia46_addr4
 #define	ie_laddr	ie_dependladdr.id46_addr.ia46_addr4
 #define	ie6_faddr	ie_dependfaddr.id6_addr
 #define	ie6_laddr	ie_dependladdr.id6_addr
 	u_int32_t	ie6_zoneid;		/* scope zone id */
 };
 
 /*
  * XXX The defines for inc_* are hacks and should be changed to direct
  * references.
  */
 struct in_conninfo {
 	u_int8_t	inc_flags;
 	u_int8_t	inc_len;
 	u_int16_t	inc_fibnum;	/* XXX was pad, 16 bits is plenty */
 	/* protocol dependent part */
 	struct	in_endpoints inc_ie;
 };
 
 /*
  * Flags for inc_flags.
  */
 #define	INC_ISIPV6	0x01
 
 #define	inc_isipv6	inc_flags	/* temp compatibility */
 #define	inc_fport	inc_ie.ie_fport
 #define	inc_lport	inc_ie.ie_lport
 #define	inc_faddr	inc_ie.ie_faddr
 #define	inc_laddr	inc_ie.ie_laddr
 #define	inc6_faddr	inc_ie.ie6_faddr
 #define	inc6_laddr	inc_ie.ie6_laddr
 #define	inc6_zoneid	inc_ie.ie6_zoneid
 
 #if defined(_KERNEL) || defined(_WANT_INPCB)
 /*
  * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
  * IPv6 sockets.  In the case of TCP and UDP, further per-connection state is
  * hung off of inp_ppcb most of the time.  Almost all fields of struct inpcb
  * are static after creation or protected by a per-inpcb rwlock, inp_lock.  A
  * few fields are protected by multiple locks as indicated in the locking notes
  * below.  For these fields, all of the listed locks must be write-locked for
  * any modifications.  However, these fields can be safely read while any one of
  * the listed locks are read-locked.  This model can permit greater concurrency
  * for read operations.  For example, connections can be looked up while only
  * holding a read lock on the global pcblist lock.  This is important for
  * performance when attempting to find the connection for a packet given its IP
  * and port tuple.
  *
  * One noteworthy exception is that the global pcbinfo lock follows a different
  * set of rules in relation to the inp_list field.  Rather than being
  * write-locked for modifications and read-locked for list iterations, it must
  * be read-locked during modifications and write-locked during list iterations.
  * This ensures that the relatively rare global list iterations safely walk a
  * stable snapshot of connections while allowing more common list modifications
  * to safely grab the pcblist lock just while adding or removing a connection
  * from the global list.
  *
  * Key:
  * (b) - Protected by the hpts lock.
  * (c) - Constant after initialization
+ * (e) - Protected by the net_epoch_prempt epoch
  * (g) - Protected by the pcbgroup lock
  * (i) - Protected by the inpcb lock
  * (p) - Protected by the pcbinfo lock for the inpcb
  * (l) - Protected by the pcblist lock for the inpcb
  * (h) - Protected by the pcbhash lock for the inpcb
  * (s) - Protected by another subsystem's locks
  * (x) - Undefined locking
  * 
  * Notes on the tcp_hpts:
  * 
  * First Hpts lock order is
  * 1) INP_WLOCK()
  * 2) HPTS_LOCK() i.e. hpts->pmtx 
  *
  * To insert a TCB on the hpts you *must* be holding the INP_WLOCK(). 
  * You may check the inp->inp_in_hpts flag without the hpts lock. 
  * The hpts is the only one that will clear this flag holding 
  * only the hpts lock. This means that in your tcp_output()
  * routine when you test for the inp_in_hpts flag to be 1 
  * it may be transitioning to 0 (by the hpts). 
  * That's ok since that will just mean an extra call to tcp_output 
  * that most likely will find the call you executed
  * (when the mis-match occured) will have put the TCB back 
  * on the hpts and it will return. If your
  * call did not add the inp back to the hpts then you will either
  * over-send or the cwnd will block you from sending more.
  *
  * Note you should also be holding the INP_WLOCK() when you
  * call the remove from the hpts as well. Though usually
  * you are either doing this from a timer, where you need and have
  * the INP_WLOCK() or from destroying your TCB where again
  * you should already have the INP_WLOCK().
  *
  * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and 
  * inp_input_cpu_set fields are controlled completely by
  * the hpts. Do not ever set these. The inp_hpts_cpu_set
  * and inp_input_cpu_set fields indicate if the hpts has
  * setup the respective cpu field. It is advised if this
  * field is 0, to enqueue the packet with the appropriate
  * hpts_immediate() call. If the _set field is 1, then
  * you may compare the inp_*_cpu field to the curcpu and
  * may want to again insert onto the hpts if these fields
  * are not equal (i.e. you are not on the expected CPU).
  *
  * A note on inp_hpts_calls and inp_input_calls, these
  * flags are set when the hpts calls either the output
  * or do_segment routines respectively. If the routine
  * being called wants to use this, then it needs to
  * clear the flag before returning. The hpts will not
  * clear the flag. The flags can be used to tell if
  * the hpts is the function calling the respective
  * routine.
  *
  * A few other notes:
  *
  * When a read lock is held, stability of the field is guaranteed; to write
  * to a field, a write lock must generally be held.
  *
  * netinet/netinet6-layer code should not assume that the inp_socket pointer
  * is safe to dereference without inp_lock being held, even for protocols
  * other than TCP (where the inpcb persists during TIMEWAIT even after the
  * socket has been freed), or there may be close(2)-related races.
  *
  * The inp_vflag field is overloaded, and would otherwise ideally be (c).
  *
  * TODO:  Currently only the TCP stack is leveraging the global pcbinfo lock
  * read-lock usage during modification, this model can be applied to other
  * protocols (especially SCTP).
  */
 struct icmp6_filter;
 struct inpcbpolicy;
 struct m_snd_tag;
 struct inpcb {
 	/* Cache line #1 (amd64) */
-	CK_LIST_ENTRY(inpcb) inp_hash;	/* (h/i) hash list */
+	CK_LIST_ENTRY(inpcb) inp_hash;	/* [w](h/i) [r](e/i)  hash list */
 	CK_LIST_ENTRY(inpcb) inp_pcbgrouphash;	/* (g/i) hash list */
 	struct rwlock	inp_lock;
 	/* Cache line #2 (amd64) */
 #define	inp_start_zero	inp_hpts
 #define	inp_zero_size	(sizeof(struct inpcb) - \
 			    offsetof(struct inpcb, inp_start_zero))
 	TAILQ_ENTRY(inpcb) inp_hpts;	/* pacing out queue next lock(b) */
 
 	uint32_t inp_hpts_request;	/* Current hpts request, zero if
 					 * fits in the pacing window (i&b). */
 	/*
 	 * Note the next fields are protected by a
 	 * different lock (hpts-lock). This means that 
 	 * they must correspond in size to the smallest
 	 * protectable bit field (uint8_t on x86, and
 	 * other platfomrs potentially uint32_t?). Also
 	 * since CPU switches can occur at different times the two
 	 * fields can *not* be collapsed into a signal bit field.
 	 */
 #if defined(__amd64__) || defined(__i386__)	
 	volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */
 	volatile uint8_t inp_in_input; /* on input hpts (lock b) */
 #else
 	volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */
 	volatile uint32_t inp_in_input; /* on input hpts (lock b) */
 #endif
 	volatile uint16_t  inp_hpts_cpu; /* Lock (i) */
 	u_int	inp_refcount;		/* (i) refcount */
 	int	inp_flags;		/* (i) generic IP/datagram flags */
 	int	inp_flags2;		/* (i) generic IP/datagram flags #2*/
 	volatile uint16_t  inp_input_cpu; /* Lock (i) */
 	volatile uint8_t inp_hpts_cpu_set :1,  /* on output hpts (i) */
 			 inp_input_cpu_set : 1,	/* on input hpts (i) */
 			 inp_hpts_calls :1,	/* (i) from output hpts */
 			 inp_input_calls :1,	/* (i) from input hpts */
 			 inp_spare_bits2 : 4;
 	uint8_t inp_spare_byte;		/* Compiler hole */
 	void	*inp_ppcb;		/* (i) pointer to per-protocol pcb */
 	struct	socket *inp_socket;	/* (i) back pointer to socket */
 	uint32_t 	 inp_hptsslot;	/* Hpts wheel slot this tcb is Lock(i&b) */
 	uint32_t         inp_hpts_drop_reas;	/* reason we are dropping the PCB (lock i&b) */
 	TAILQ_ENTRY(inpcb) inp_input;	/* pacing in  queue next lock(b) */
 	struct	inpcbinfo *inp_pcbinfo;	/* (c) PCB list info */
 	struct	inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
 	CK_LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
 	struct	ucred	*inp_cred;	/* (c) cache of socket cred */
 	u_int32_t inp_flow;		/* (i) IPv6 flow information */
 	u_char	inp_vflag;		/* (i) IP version flag (v4/v6) */
 	u_char	inp_ip_ttl;		/* (i) time to live proto */
 	u_char	inp_ip_p;		/* (c) protocol proto */
 	u_char	inp_ip_minttl;		/* (i) minimum TTL or drop */
 	uint32_t inp_flowid;		/* (x) flow id / queue id */
 	struct m_snd_tag *inp_snd_tag;	/* (i) send tag for outgoing mbufs */
 	uint32_t inp_flowtype;		/* (x) M_HASHTYPE value */
 	uint32_t inp_rss_listen_bucket;	/* (x) overridden RSS listen bucket */
 
 	/* Local and foreign ports, local and foreign addr. */
 	struct	in_conninfo inp_inc;	/* (i) list for PCB's local port */
 
 	/* MAC and IPSEC policy information. */
 	struct	label *inp_label;	/* (i) MAC label */
 	struct	inpcbpolicy *inp_sp;    /* (s) for IPSEC */
 
 	/* Protocol-dependent part; options. */
 	struct {
 		u_char	inp_ip_tos;		/* (i) type of service proto */
 		struct mbuf		*inp_options;	/* (i) IP options */
 		struct ip_moptions	*inp_moptions;	/* (i) mcast options */
 	};
 	struct {
 		/* (i) IP options */
 		struct mbuf		*in6p_options;
 		/* (i) IP6 options for outgoing packets */
 		struct ip6_pktopts	*in6p_outputopts;
 		/* (i) IP multicast options */
 		struct ip6_moptions	*in6p_moptions;
 		/* (i) ICMPv6 code type filter */
 		struct icmp6_filter	*in6p_icmp6filt;
 		/* (i) IPV6_CHECKSUM setsockopt */
 		int	in6p_cksum;
 		short	in6p_hops;
 	};
 	CK_LIST_ENTRY(inpcb) inp_portlist;	/* (i/h) */
 	struct	inpcbport *inp_phd;	/* (i/h) head of this list */
 	inp_gen_t	inp_gencnt;	/* (c) generation count */
 	struct llentry	*inp_lle;	/* cached L2 information */
 	rt_gen_t	inp_rt_cookie;	/* generation for route entry */
 	union {				/* cached L3 information */
 		struct route inp_route;
 		struct route_in6 inp_route6;
 	};
 	CK_LIST_ENTRY(inpcb) inp_list;	/* (p/l) list for all PCBs for proto */
-	                                /* (p[w]) for list iteration */
-	                                /* (p[r]/l) for addition/removal */
+	                                /* (e[r]) for list iteration */
+	                                /* (p[w]/l) for addition/removal */
 	struct epoch_context inp_epoch_ctx;
 };
 #endif	/* _KERNEL */
 
 #define	inp_fport	inp_inc.inc_fport
 #define	inp_lport	inp_inc.inc_lport
 #define	inp_faddr	inp_inc.inc_faddr
 #define	inp_laddr	inp_inc.inc_laddr
 
 #define	in6p_faddr	inp_inc.inc6_faddr
 #define	in6p_laddr	inp_inc.inc6_laddr
 #define	in6p_zoneid	inp_inc.inc6_zoneid
 #define	in6p_flowinfo	inp_flow
 
 #define	inp_vnet	inp_pcbinfo->ipi_vnet
 
 /*
  * The range of the generation count, as used in this implementation, is 9e19.
  * We would have to create 300 billion connections per second for this number
  * to roll over in a year.  This seems sufficiently unlikely that we simply
  * don't concern ourselves with that possibility.
  */
 
 /*
  * Interface exported to userland by various protocols which use inpcbs.  Hack
  * alert -- only define if struct xsocket is in scope.
  * Fields prefixed with "xi_" are unique to this structure, and the rest
  * match fields in the struct inpcb, to ease coding and porting.
  *
  * Legend:
  * (s) - used by userland utilities in src
  * (p) - used by utilities in ports
  * (3) - is known to be used by third party software not in ports
  * (n) - no known usage
  */
 #ifdef _SYS_SOCKETVAR_H_
 struct xinpcb {
 	size_t		xi_len;		/* length of this structure */
 	struct xsocket	xi_socket;		/* (s,p) */
 	struct in_conninfo inp_inc;		/* (s,p) */
 	uint64_t	inp_gencnt;		/* (s,p) */
 	union {
 		void	*inp_ppcb;		/* (s) netstat(1) */
 		int64_t	ph_ppcb;
 	};
 	int64_t		inp_spare64[4];
 	uint32_t	inp_flow;		/* (s) */
 	uint32_t	inp_flowid;		/* (s) */
 	uint32_t	inp_flowtype;		/* (s) */
 	int32_t		inp_flags;		/* (s,p) */
 	int32_t		inp_flags2;		/* (s) */
 	int32_t		inp_rss_listen_bucket;	/* (n) */
 	int32_t		in6p_cksum;		/* (n) */
 	int32_t		inp_spare32[4];
 	uint16_t	in6p_hops;		/* (n) */
 	uint8_t		inp_ip_tos;		/* (n) */
 	int8_t		pad8;
 	uint8_t		inp_vflag;		/* (s,p) */
 	uint8_t		inp_ip_ttl;		/* (n) */
 	uint8_t		inp_ip_p;		/* (n) */
 	uint8_t		inp_ip_minttl;		/* (n) */
 	int8_t		inp_spare8[4];
 } __aligned(8);
 
 struct xinpgen {
 	size_t		xig_len;	/* length of this structure */
 	u_int		xig_count;	/* number of PCBs at this time */
 	inp_gen_t	xig_gen;	/* generation count at this time */
 	so_gen_t	xig_sogen;	/* socket generation count this time */
 } __aligned(8);
 #ifdef	_KERNEL
 void	in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
 #endif
 #endif /* _SYS_SOCKETVAR_H_ */
 
 struct inpcbport {
 	struct epoch_context phd_epoch_ctx;
 	CK_LIST_ENTRY(inpcbport) phd_hash;
 	struct inpcbhead phd_pcblist;
 	u_short phd_port;
 };
 
 struct in_pcblist {
 	int il_count;
 	struct epoch_context il_epoch_ctx;
 	struct inpcbinfo *il_pcbinfo;
 	struct inpcb *il_inp_list[0];
 };
 
 /*-
  * Global data structure for each high-level protocol (UDP, TCP, ...) in both
  * IPv4 and IPv6.  Holds inpcb lists and information for managing them.
  *
  * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and
  * ipi_list_lock:
  *  - ipi_lock covering the global pcb list stability during loop iteration,
  *  - ipi_hash_lock covering the hashed lookup tables,
  *  - ipi_list_lock covering mutable global fields (such as the global
  *    pcb list)
  *
  * The lock order is:
  *
  *    ipi_lock (before)
  *        inpcb locks (before)
  *            ipi_list locks (before)
  *                {ipi_hash_lock, pcbgroup locks}
  *
  * Locking key:
  *
  * (c) Constant or nearly constant after initialisation
+ * (e) - Protected by the net_epoch_prempt epoch
  * (g) Locked by ipi_lock
  * (l) Locked by ipi_list_lock
- * (h) Read using either ipi_hash_lock or inpcb lock; write requires both
+ * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock
  * (p) Protected by one or more pcbgroup locks
  * (x) Synchronisation properties poorly defined
  */
 struct inpcbinfo {
 	/*
-	 * Global lock protecting full inpcb list traversal
+	 * Global lock protecting inpcb list modification
 	 */
-	struct rwlock		 ipi_lock;
+	struct mtx		 ipi_lock;
 
 	/*
 	 * Global list of inpcbs on the protocol.
 	 */
-	struct inpcbhead	*ipi_listhead;		/* (g/l) */
+	struct inpcbhead	*ipi_listhead;		/* [r](e) [w](g/l) */
 	u_int			 ipi_count;		/* (l) */
 
 	/*
 	 * Generation count -- incremented each time a connection is allocated
 	 * or freed.
 	 */
 	u_quad_t		 ipi_gencnt;		/* (l) */
 
 	/*
 	 * Fields associated with port lookup and allocation.
 	 */
 	u_short			 ipi_lastport;		/* (x) */
 	u_short			 ipi_lastlow;		/* (x) */
 	u_short			 ipi_lasthi;		/* (x) */
 
 	/*
 	 * UMA zone from which inpcbs are allocated for this protocol.
 	 */
 	struct	uma_zone	*ipi_zone;		/* (c) */
 
 	/*
 	 * Connection groups associated with this protocol.  These fields are
 	 * constant, but pcbgroup structures themselves are protected by
 	 * per-pcbgroup locks.
 	 */
 	struct inpcbgroup	*ipi_pcbgroups;		/* (c) */
 	u_int			 ipi_npcbgroups;	/* (c) */
 	u_int			 ipi_hashfields;	/* (c) */
 
 	/*
-	 * Global lock protecting non-pcbgroup hash lookup tables.
+	 * Global lock protecting modification non-pcbgroup hash lookup tables.
 	 */
-	struct rwlock		 ipi_hash_lock;
+	struct mtx		 ipi_hash_lock;
 
 	/*
 	 * Global hash of inpcbs, hashed by local and foreign addresses and
 	 * port numbers.
 	 */
 	struct inpcbhead	*ipi_hashbase;		/* (h) */
 	u_long			 ipi_hashmask;		/* (h) */
 
 	/*
 	 * Global hash of inpcbs, hashed by only local port number.
 	 */
 	struct inpcbporthead	*ipi_porthashbase;	/* (h) */
 	u_long			 ipi_porthashmask;	/* (h) */
 
 	/*
 	 * List of wildcard inpcbs for use with pcbgroups.  In the past, was
 	 * per-pcbgroup but is now global.  All pcbgroup locks must be held
 	 * to modify the list, so any is sufficient to read it.
 	 */
 	struct inpcbhead	*ipi_wildbase;		/* (p) */
 	u_long			 ipi_wildmask;		/* (p) */
 
 	/*
 	 * Load balance groups used for the SO_REUSEPORT_LB option,
 	 * hashed by local port.
 	 */
 	struct	inpcblbgrouphead *ipi_lbgrouphashbase;	/* (h) */
 	u_long			 ipi_lbgrouphashmask;	/* (h) */
 
 	/*
 	 * Pointer to network stack instance
 	 */
 	struct vnet		*ipi_vnet;		/* (c) */
 
 	/*
 	 * general use 2
 	 */
 	void 			*ipi_pspare[2];
 
 	/*
 	 * Global lock protecting global inpcb list, inpcb count, etc.
 	 */
 	struct rwlock		 ipi_list_lock;
 };
 
 #ifdef _KERNEL
 /*
  * Connection groups hold sets of connections that have similar CPU/thread
  * affinity.  Each connection belongs to exactly one connection group.
  */
 struct inpcbgroup {
 	/*
 	 * Per-connection group hash of inpcbs, hashed by local and foreign
 	 * addresses and port numbers.
 	 */
 	struct inpcbhead	*ipg_hashbase;		/* (c) */
 	u_long			 ipg_hashmask;		/* (c) */
 
 	/*
 	 * Notional affinity of this pcbgroup.
 	 */
 	u_int			 ipg_cpu;		/* (p) */
 
 	/*
 	 * Per-connection group lock, not to be confused with ipi_lock.
 	 * Protects the hash table hung off the group, but also the global
 	 * wildcard list in inpcbinfo.
 	 */
 	struct mtx		 ipg_lock;
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
  * (or unique address:port combination) can be re-used at most
  * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which
  * is dynamically resized as processes bind/unbind to that specific group.
  */
 struct inpcblbgroup {
 	LIST_ENTRY(inpcblbgroup) il_list;
 	uint16_t	il_lport;			/* (c) */
 	u_char		il_vflag;			/* (c) */
 	u_char		il_pad;
 	uint32_t	il_pad2;
 	union in_dependaddr il_dependladdr;		/* (c) */
 #define	il_laddr	il_dependladdr.id46_addr.ia46_addr4
 #define	il6_laddr	il_dependladdr.id6_addr
 	uint32_t	il_inpsiz; /* max count in il_inp[] (h) */
 	uint32_t	il_inpcnt; /* cur count in il_inp[] (h) */
 	struct inpcb	*il_inp[];			/* (h) */
 };
 LIST_HEAD(inpcblbgrouphead, inpcblbgroup);
 
 #define INP_LOCK_INIT(inp, d, t) \
 	rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE |  RW_DUPOK)
 #define INP_LOCK_DESTROY(inp)	rw_destroy(&(inp)->inp_lock)
 #define INP_RLOCK(inp)		rw_rlock(&(inp)->inp_lock)
 #define INP_WLOCK(inp)		rw_wlock(&(inp)->inp_lock)
 #define INP_TRY_RLOCK(inp)	rw_try_rlock(&(inp)->inp_lock)
 #define INP_TRY_WLOCK(inp)	rw_try_wlock(&(inp)->inp_lock)
 #define INP_RUNLOCK(inp)	rw_runlock(&(inp)->inp_lock)
 #define INP_WUNLOCK(inp)	rw_wunlock(&(inp)->inp_lock)
 #define	INP_TRY_UPGRADE(inp)	rw_try_upgrade(&(inp)->inp_lock)
 #define	INP_DOWNGRADE(inp)	rw_downgrade(&(inp)->inp_lock)
 #define	INP_WLOCKED(inp)	rw_wowned(&(inp)->inp_lock)
 #define	INP_LOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_LOCKED)
 #define	INP_RLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_RLOCKED)
 #define	INP_WLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_WLOCKED)
 #define	INP_UNLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_UNLOCKED)
 
 /*
  * These locking functions are for inpcb consumers outside of sys/netinet,
  * more specifically, they were added for the benefit of TOE drivers. The
  * macros are reserved for use by the stack.
  */
 void inp_wlock(struct inpcb *);
 void inp_wunlock(struct inpcb *);
 void inp_rlock(struct inpcb *);
 void inp_runlock(struct inpcb *);
 
 #ifdef INVARIANT_SUPPORT
 void inp_lock_assert(struct inpcb *);
 void inp_unlock_assert(struct inpcb *);
 #else
 #define	inp_lock_assert(inp)	do {} while (0)
 #define	inp_unlock_assert(inp)	do {} while (0)
 #endif
 
 void	inp_apply_all(void (*func)(struct inpcb *, void *), void *arg);
 int 	inp_ip_tos_get(const struct inpcb *inp);
 void 	inp_ip_tos_set(struct inpcb *inp, int val);
 struct socket *
 	inp_inpcbtosocket(struct inpcb *inp);
 struct tcpcb *
 	inp_inpcbtotcpcb(struct inpcb *inp);
 void 	inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
 		uint32_t *faddr, uint16_t *fp);
 int	inp_so_options(const struct inpcb *inp);
 
 #endif /* _KERNEL */
 
 #define INP_INFO_LOCK_INIT(ipi, d) \
-	rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE)
-#define INP_INFO_LOCK_DESTROY(ipi)  rw_destroy(&(ipi)->ipi_lock)
-#define INP_INFO_RLOCK(ipi)	rw_rlock(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCK(ipi)	rw_wlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_RLOCK(ipi)	rw_try_rlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_WLOCK(ipi)	rw_try_wlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_UPGRADE(ipi)	rw_try_upgrade(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCKED(ipi)	rw_wowned(&(ipi)->ipi_lock)
-#define INP_INFO_RUNLOCK(ipi)	rw_runlock(&(ipi)->ipi_lock)
-#define INP_INFO_WUNLOCK(ipi)	rw_wunlock(&(ipi)->ipi_lock)
-#define	INP_INFO_LOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_LOCKED)
-#define INP_INFO_RLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_RLOCKED)
-#define INP_INFO_WLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
-#define INP_INFO_UNLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
+	mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE)
+#define INP_INFO_LOCK_DESTROY(ipi)  mtx_destroy(&(ipi)->ipi_lock)
+#define INP_INFO_RLOCK(ipi)	NET_EPOCH_ENTER()
+#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
+#define INP_INFO_TRY_WLOCK(ipi)	mtx_trylock(&(ipi)->ipi_lock)
+#define INP_INFO_WLOCKED(ipi)	mtx_owned(&(ipi)->ipi_lock)
+#define INP_INFO_RUNLOCK(ipi)	NET_EPOCH_EXIT()
+#define INP_INFO_WUNLOCK(ipi)	mtx_unlock(&(ipi)->ipi_lock)
+#define	INP_INFO_LOCK_ASSERT(ipi)	MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_lock))
+#define INP_INFO_RLOCK_ASSERT(ipi)	MPASS(in_epoch())
+#define INP_INFO_WLOCK_ASSERT(ipi)	mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
+#define INP_INFO_UNLOCK_ASSERT(ipi)	MPASS(!in_epoch() && !mtx_owned(&(ipi)->ipi_lock))
 
 #define INP_LIST_LOCK_INIT(ipi, d) \
         rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
 #define INP_LIST_LOCK_DESTROY(ipi)  rw_destroy(&(ipi)->ipi_list_lock)
 #define INP_LIST_RLOCK(ipi)     rw_rlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_WLOCK(ipi)     rw_wlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_TRY_UPGRADE(ipi)       rw_try_upgrade(&(ipi)->ipi_list_lock)
 #define INP_LIST_RUNLOCK(ipi)   rw_runlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_WUNLOCK(ipi)   rw_wunlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_LOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED)
 #define INP_LIST_RLOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED)
 #define INP_LIST_WLOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED)
 #define INP_LIST_UNLOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
 
-#define	INP_HASH_LOCK_INIT(ipi, d) \
-	rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0)
-#define	INP_HASH_LOCK_DESTROY(ipi)	rw_destroy(&(ipi)->ipi_hash_lock)
-#define	INP_HASH_RLOCK(ipi)		rw_rlock(&(ipi)->ipi_hash_lock)
-#define	INP_HASH_WLOCK(ipi)		rw_wlock(&(ipi)->ipi_hash_lock)
-#define	INP_HASH_RUNLOCK(ipi)		rw_runlock(&(ipi)->ipi_hash_lock)
-#define	INP_HASH_WUNLOCK(ipi)		rw_wunlock(&(ipi)->ipi_hash_lock)
-#define	INP_HASH_LOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_hash_lock, \
-					    RA_LOCKED)
-#define	INP_HASH_WLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_hash_lock, \
-					    RA_WLOCKED)
+#define	INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF)
+#define	INP_HASH_LOCK_DESTROY(ipi)	mtx_destroy(&(ipi)->ipi_hash_lock)
+#define	INP_HASH_RLOCK(ipi)		NET_EPOCH_ENTER()
+#define	INP_HASH_WLOCK(ipi)		mtx_lock(&(ipi)->ipi_hash_lock)
+#define	INP_HASH_RUNLOCK(ipi)		NET_EPOCH_EXIT()
+#define	INP_HASH_WUNLOCK(ipi)		mtx_unlock(&(ipi)->ipi_hash_lock)
+#define	INP_HASH_LOCK_ASSERT(ipi)	MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_hash_lock))
+#define	INP_HASH_WLOCK_ASSERT(ipi)	mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED);
 
 #define	INP_GROUP_LOCK_INIT(ipg, d)	mtx_init(&(ipg)->ipg_lock, (d), NULL, \
 					    MTX_DEF | MTX_DUPOK)
 #define	INP_GROUP_LOCK_DESTROY(ipg)	mtx_destroy(&(ipg)->ipg_lock)
 
 #define	INP_GROUP_LOCK(ipg)		mtx_lock(&(ipg)->ipg_lock)
 #define	INP_GROUP_LOCK_ASSERT(ipg)	mtx_assert(&(ipg)->ipg_lock, MA_OWNED)
 #define	INP_GROUP_UNLOCK(ipg)		mtx_unlock(&(ipg)->ipg_lock)
 
 #define INP_PCBHASH(faddr, lport, fport, mask) \
 	(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
 #define INP_PCBPORTHASH(lport, mask) \
 	(ntohs((lport)) & (mask))
 #define	INP_PCBLBGROUP_PORTHASH(lport, mask) \
 	(ntohs((lport)) & (mask))
 #define	INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
 	((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport)))
 #define	INP6_PCBHASHKEY(faddr)	((faddr)->s6_addr32[3])
 
 /*
  * Flags for inp_vflags -- historically version flags only
  */
 #define	INP_IPV4	0x1
 #define	INP_IPV6	0x2
 #define	INP_IPV6PROTO	0x4		/* opened under IPv6 protocol */
 
 /*
  * Flags for inp_flags.
  */
 #define	INP_RECVOPTS		0x00000001 /* receive incoming IP options */
 #define	INP_RECVRETOPTS		0x00000002 /* receive IP options for reply */
 #define	INP_RECVDSTADDR		0x00000004 /* receive IP dst address */
 #define	INP_HDRINCL		0x00000008 /* user supplies entire IP header */
 #define	INP_HIGHPORT		0x00000010 /* user wants "high" port binding */
 #define	INP_LOWPORT		0x00000020 /* user wants "low" port binding */
 #define	INP_ANONPORT		0x00000040 /* port chosen for user */
 #define	INP_RECVIF		0x00000080 /* receive incoming interface */
 #define	INP_MTUDISC		0x00000100 /* user can do MTU discovery */
 				   	   /* 0x000200 unused: was INP_FAITH */
 #define	INP_RECVTTL		0x00000400 /* receive incoming IP TTL */
 #define	INP_DONTFRAG		0x00000800 /* don't fragment packet */
 #define	INP_BINDANY		0x00001000 /* allow bind to any address */
 #define	INP_INHASHLIST		0x00002000 /* in_pcbinshash() has been called */
 #define	INP_RECVTOS		0x00004000 /* receive incoming IP TOS */
 #define	IN6P_IPV6_V6ONLY	0x00008000 /* restrict AF_INET6 socket for v6 */
 #define	IN6P_PKTINFO		0x00010000 /* receive IP6 dst and I/F */
 #define	IN6P_HOPLIMIT		0x00020000 /* receive hoplimit */
 #define	IN6P_HOPOPTS		0x00040000 /* receive hop-by-hop options */
 #define	IN6P_DSTOPTS		0x00080000 /* receive dst options after rthdr */
 #define	IN6P_RTHDR		0x00100000 /* receive routing header */
 #define	IN6P_RTHDRDSTOPTS	0x00200000 /* receive dstoptions before rthdr */
 #define	IN6P_TCLASS		0x00400000 /* receive traffic class value */
 #define	IN6P_AUTOFLOWLABEL	0x00800000 /* attach flowlabel automatically */
 #define	INP_TIMEWAIT		0x01000000 /* in TIMEWAIT, ppcb is tcptw */
 #define	INP_ONESBCAST		0x02000000 /* send all-ones broadcast */
 #define	INP_DROPPED		0x04000000 /* protocol drop flag */
 #define	INP_SOCKREF		0x08000000 /* strong socket reference */
 #define	INP_RESERVED_0          0x10000000 /* reserved field */
 #define	INP_RESERVED_1          0x20000000 /* reserved field */
 #define	IN6P_RFC2292		0x40000000 /* used RFC2292 API on the socket */
 #define	IN6P_MTU		0x80000000 /* receive path MTU */
 
 #define	INP_CONTROLOPTS		(INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\
 				 INP_RECVIF|INP_RECVTTL|INP_RECVTOS|\
 				 IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\
 				 IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\
 				 IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\
 				 IN6P_MTU)
 
 /*
  * Flags for inp_flags2.
  */
 #define	INP_LLE_VALID		0x00000001 /* cached lle is valid */	
 #define	INP_RT_VALID		0x00000002 /* cached rtentry is valid */
 #define	INP_PCBGROUPWILD	0x00000004 /* in pcbgroup wildcard list */
 #define	INP_REUSEPORT		0x00000008 /* SO_REUSEPORT option is set */
 #define	INP_FREED		0x00000010 /* inp itself is not valid */
 #define	INP_REUSEADDR		0x00000020 /* SO_REUSEADDR option is set */
 #define	INP_BINDMULTI		0x00000040 /* IP_BINDMULTI option is set */
 #define	INP_RSS_BUCKET_SET	0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
 #define	INP_RECVFLOWID		0x00000100 /* populate recv datagram with flow info */
 #define	INP_RECVRSSBUCKETID	0x00000200 /* populate recv datagram with bucket id */
 #define	INP_RATE_LIMIT_CHANGED	0x00000400 /* rate limit needs attention */
 #define	INP_ORIGDSTADDR		0x00000800 /* receive IP dst address/port */
 #define INP_CANNOT_DO_ECN	0x00001000 /* The stack does not do ECN */
 #define	INP_REUSEPORT_LB	0x00002000 /* SO_REUSEPORT_LB option is set */
 
 /*
  * Flags passed to in_pcblookup*() functions.
  */
 #define	INPLOOKUP_WILDCARD	0x00000001	/* Allow wildcard sockets. */
 #define	INPLOOKUP_RLOCKPCB	0x00000002	/* Return inpcb read-locked. */
 #define	INPLOOKUP_WLOCKPCB	0x00000004	/* Return inpcb write-locked. */
 
 #define	INPLOOKUP_MASK	(INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
 			    INPLOOKUP_WLOCKPCB)
 
 #define	sotoinpcb(so)	((struct inpcb *)(so)->so_pcb)
 #define	sotoin6pcb(so)	sotoinpcb(so) /* for KAME src sync over BSD*'s */
 
 #define	INP_SOCKAF(so) so->so_proto->pr_domain->dom_family
 
 #define	INP_CHECK_SOCKAF(so, af)	(INP_SOCKAF(so) == af)
 
 /*
  * Constants for pcbinfo.ipi_hashfields.
  */
 #define	IPI_HASHFIELDS_NONE	0
 #define	IPI_HASHFIELDS_2TUPLE	1
 #define	IPI_HASHFIELDS_4TUPLE	2
 
 #ifdef _KERNEL
 VNET_DECLARE(int, ipport_reservedhigh);
 VNET_DECLARE(int, ipport_reservedlow);
 VNET_DECLARE(int, ipport_lowfirstauto);
 VNET_DECLARE(int, ipport_lowlastauto);
 VNET_DECLARE(int, ipport_firstauto);
 VNET_DECLARE(int, ipport_lastauto);
 VNET_DECLARE(int, ipport_hifirstauto);
 VNET_DECLARE(int, ipport_hilastauto);
 VNET_DECLARE(int, ipport_randomized);
 VNET_DECLARE(int, ipport_randomcps);
 VNET_DECLARE(int, ipport_randomtime);
 VNET_DECLARE(int, ipport_stoprandom);
 VNET_DECLARE(int, ipport_tcpallocs);
 
 #define	V_ipport_reservedhigh	VNET(ipport_reservedhigh)
 #define	V_ipport_reservedlow	VNET(ipport_reservedlow)
 #define	V_ipport_lowfirstauto	VNET(ipport_lowfirstauto)
 #define	V_ipport_lowlastauto	VNET(ipport_lowlastauto)
 #define	V_ipport_firstauto	VNET(ipport_firstauto)
 #define	V_ipport_lastauto	VNET(ipport_lastauto)
 #define	V_ipport_hifirstauto	VNET(ipport_hifirstauto)
 #define	V_ipport_hilastauto	VNET(ipport_hilastauto)
 #define	V_ipport_randomized	VNET(ipport_randomized)
 #define	V_ipport_randomcps	VNET(ipport_randomcps)
 #define	V_ipport_randomtime	VNET(ipport_randomtime)
 #define	V_ipport_stoprandom	VNET(ipport_stoprandom)
 #define	V_ipport_tcpallocs	VNET(ipport_tcpallocs)
 
 void	in_pcbinfo_destroy(struct inpcbinfo *);
 void	in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *,
 	    int, int, char *, uma_init, u_int);
 
 int	in_pcbbind_check_bindmulti(const struct inpcb *ni,
 	    const struct inpcb *oi);
 
 struct inpcbgroup *
 	in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t);
 struct inpcbgroup *
 	in_pcbgroup_byinpcb(struct inpcb *);
 struct inpcbgroup *
 	in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short,
 	    struct in_addr, u_short);
 void	in_pcbgroup_destroy(struct inpcbinfo *);
 int	in_pcbgroup_enabled(struct inpcbinfo *);
 void	in_pcbgroup_init(struct inpcbinfo *, u_int, int);
 void	in_pcbgroup_remove(struct inpcb *);
 void	in_pcbgroup_update(struct inpcb *);
 void	in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *);
 
 void	in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
 int	in_pcballoc(struct socket *, struct inpcbinfo *);
 int	in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *);
 int	in_pcb_lport(struct inpcb *, struct in_addr *, u_short *,
 	    struct ucred *, int);
 int	in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
 	    u_short *, struct ucred *);
 int	in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *);
 int	in_pcbconnect_mbuf(struct inpcb *, struct sockaddr *, struct ucred *,
 	    struct mbuf *);
 int	in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
 	    u_short *, in_addr_t *, u_short *, struct inpcb **,
 	    struct ucred *);
 void	in_pcbdetach(struct inpcb *);
 void	in_pcbdisconnect(struct inpcb *);
 void	in_pcbdrop(struct inpcb *);
 void	in_pcbfree(struct inpcb *);
 int	in_pcbinshash(struct inpcb *);
 int	in_pcbinshash_nopcbgroup(struct inpcb *);
 int	in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *,
 	    struct ucred *);
 struct inpcb *
 	in_pcblookup_local(struct inpcbinfo *,
 	    struct in_addr, u_short, int, struct ucred *);
 struct inpcb *
 	in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
 	    struct in_addr, u_int, int, struct ifnet *);
 struct inpcb *
 	in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
 	    struct in_addr, u_int, int, struct ifnet *, struct mbuf *);
 void	in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
 	    int, struct inpcb *(*)(struct inpcb *, int));
 void	in_pcbref(struct inpcb *);
 void	in_pcbrehash(struct inpcb *);
 void	in_pcbrehash_mbuf(struct inpcb *, struct mbuf *);
 int	in_pcbrele(struct inpcb *);
 int	in_pcbrele_rlocked(struct inpcb *);
 int	in_pcbrele_wlocked(struct inpcb *);
 void	in_pcblist_rele_rlocked(epoch_context_t ctx);
 void	in_losing(struct inpcb *);
 void	in_pcbsetsolabel(struct socket *so);
 int	in_getpeeraddr(struct socket *so, struct sockaddr **nam);
 int	in_getsockaddr(struct socket *so, struct sockaddr **nam);
 struct sockaddr *
 	in_sockaddr(in_port_t port, struct in_addr *addr);
 void	in_pcbsosetlabel(struct socket *so);
 #ifdef RATELIMIT
 int	in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
 void	in_pcbdetach_txrtlmt(struct inpcb *);
 int	in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
 int	in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
 int	in_pcbquery_txrlevel(struct inpcb *, uint32_t *);
 void	in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
 void	in_pcboutput_eagain(struct inpcb *);
 #endif
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_IN_PCB_H_ */
Index: head/sys/netinet/tcp_hpts.c
===================================================================
--- head/sys/netinet/tcp_hpts.c	(revision 335355)
+++ head/sys/netinet/tcp_hpts.c	(revision 335356)
@@ -1,1963 +1,1902 @@
 /*-
  * Copyright (c) 2016-2018 Netflix Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 /**
  * Some notes about usage.
  *
  * The tcp_hpts system is designed to provide a high precision timer
  * system for tcp. Its main purpose is to provide a mechanism for 
  * pacing packets out onto the wire. It can be used in two ways
  * by a given TCP stack (and those two methods can be used simultaneously).
  *
  * First, and probably the main thing its used by Rack and BBR for, it can
  * be used to call tcp_output() of a transport stack at some time in the future.
  * The normal way this is done is that tcp_output() of the stack schedules
  * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
  * slot is the time from now that the stack wants to be called but it
  * must be converted to tcp_hpts's notion of slot. This is done with
  * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
  * call from the tcp_output() routine might look like:
  *
  * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
  *
  * The above would schedule tcp_ouput() to be called in 550 useconds.
  * Note that if using this mechanism the stack will want to add near
  * its top a check to prevent unwanted calls (from user land or the
  * arrival of incoming ack's). So it would add something like:
  *
  * if (inp->inp_in_hpts)
  *    return;
  *
  * to prevent output processing until the time alotted has gone by.
  * Of course this is a bare bones example and the stack will probably
  * have more consideration then just the above.
  *
  * Now the tcp_hpts system will call tcp_output in one of two forms, 
  * it will first check to see if the stack as defined a 
  * tfb_tcp_output_wtime() function, if so that is the routine it
  * will call, if that function is not defined then it will call the
  * tfb_tcp_output() function. The only difference between these
  * two calls is that the former passes the time in to the function
  * so the function does not have to access the time (which tcp_hpts
  * already has). What these functions do is of course totally up
  * to the individual tcp stack.
  *
  * Now the second function (actually two functions I guess :D)
  * the tcp_hpts system provides is the  ability to either abort 
  * a connection (later) or process  input on a connection. 
  * Why would you want to do this? To keep processor locality.
  *
  * So in order to use the input redirection function the
  * stack changes its tcp_do_segment() routine to instead
  * of process the data call the function:
  *
  * tcp_queue_pkt_to_input()
  *
  * You will note that the arguments to this function look
  * a lot like tcp_do_segments's arguments. This function
  * will assure that the tcp_hpts system will
  * call the functions tfb_tcp_hpts_do_segment() from the
  * correct CPU. Note that multiple calls can get pushed
  * into the tcp_hpts system this will be indicated by
  * the next to last argument to tfb_tcp_hpts_do_segment()
  * (nxt_pkt). If nxt_pkt is a 1 then another packet is
  * coming. If nxt_pkt is a 0 then this is the last call
  * that the tcp_hpts system has available for the tcp stack.
  * 
  * The other point of the input system is to be able to safely
  * drop a tcp connection without worrying about the recursive 
  * locking that may be occuring on the INP_WLOCK. So if
  * a stack wants to drop a connection it calls:
  *
  *     tcp_set_inp_to_drop(tp, ETIMEDOUT)
  * 
  * To schedule the tcp_hpts system to call 
  * 
  *    tcp_drop(tp, drop_reason)
  *
  * at a future point. This is quite handy to prevent locking
  * issues when dropping connections.
  *
  */
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/hhook.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/refcount.h>
 #include <sys/sched.h>
 #include <sys/queue.h>
 #include <sys/smp.h>
 #include <sys/counter.h>
 #include <sys/time.h>
 #include <sys/kthread.h>
 #include <sys/kern_prefetch.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/vnet.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_hpts.h>
 
 #ifdef tcpdebug
 #include <netinet/tcp_debug.h>
 #endif				/* tcpdebug */
 #ifdef tcp_offload
 #include <netinet/tcp_offload.h>
 #endif
 
 #ifdef ipsec
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif				/* ipsec */
 #include "opt_rss.h"
 
 MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
 #ifdef RSS
 static int tcp_bind_threads = 1;
 #else
 static int tcp_bind_threads = 0;
 #endif
 TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
 
 static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;
 
 TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);
 
 static struct tcp_hptsi tcp_pace;
 
-static int
-tcp_hptsi_lock_inpinfo(struct inpcb *inp,
-    struct tcpcb **tp);
 static void tcp_wakehpts(struct tcp_hpts_entry *p);
 static void tcp_wakeinput(struct tcp_hpts_entry *p);
 static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
 static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
 static void tcp_hpts_thread(void *ctx);
 static void tcp_init_hptsi(void *st);
 
 int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
 static int32_t tcp_hpts_callout_skip_swi = 0;
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls");
 
 #define	timersub(tvp, uvp, vvp)						\
 	do {								\
 		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
 		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
 		if ((vvp)->tv_usec < 0) {				\
 			(vvp)->tv_sec--;				\
 			(vvp)->tv_usec += 1000000;			\
 		}							\
 	} while (0)
 
 static int32_t logging_on = 0;
 static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
 static int32_t tcp_hpts_precision = 120;
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
     &tcp_hpts_precision, 120,
     "Value for PRE() precision of callout");
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
     &logging_on, 0,
     "Turn on logging if compiled in");
 
 counter_u64_t hpts_loops;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
     &hpts_loops, "Number of times hpts had to loop to catch up");
 
 counter_u64_t back_tosleep;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
     &back_tosleep, "Number of times hpts found no tcbs");
 
 static int32_t in_newts_every_tcb = 0;
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
     &in_newts_every_tcb, 0,
     "Do we have a new cts every tcb we process for input");
 static int32_t in_ts_percision = 0;
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
     &in_ts_percision, 0,
     "Do we use percise timestamp for clients on input");
 static int32_t out_newts_every_tcb = 0;
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
     &out_newts_every_tcb, 0,
     "Do we have a new cts every tcb we process for output");
 static int32_t out_ts_percision = 0;
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
     &out_ts_percision, 0,
     "Do we use a percise timestamp for every output cts");
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
     &hpts_sleep_max, 0,
     "The maximum time the hpts will sleep <1 - 254>");
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
     &tcp_min_hptsi_time, 0,
     "The minimum time the hpts must sleep before processing more slots");
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
     &tcp_hpts_callout_skip_swi, 0,
     "Do we have the callout call directly to the hpts?");
 
 static void
 __tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
     uint32_t ticknow, int32_t line)
 {
 	struct hpts_log *pl;
 
 	HPTS_MTX_ASSERT(hpts);
 	if (hpts->p_log == NULL)
 		return;
 	pl = &hpts->p_log[hpts->p_log_at];
 	hpts->p_log_at++;
 	if (hpts->p_log_at >= hpts->p_logsize) {
 		hpts->p_log_at = 0;
 		hpts->p_log_wrapped = 1;
 	}
 	pl->inp = inp;
 	if (inp) {
 		pl->t_paceslot = inp->inp_hptsslot;
 		pl->t_hptsreq = inp->inp_hpts_request;
 		pl->p_onhpts = inp->inp_in_hpts;
 		pl->p_oninput = inp->inp_in_input;
 	} else {
 		pl->t_paceslot = 0;
 		pl->t_hptsreq = 0;
 		pl->p_onhpts = 0;
 		pl->p_oninput = 0;
 	}
 	pl->is_notempty = 1;
 	pl->event = event;
 	pl->line = line;
 	pl->cts = tcp_get_usecs(NULL);
 	pl->p_curtick = hpts->p_curtick;
 	pl->p_prevtick = hpts->p_prevtick;
 	pl->p_on_queue_cnt = hpts->p_on_queue_cnt;
 	pl->ticknow = ticknow;
 	pl->slot_req = slot;
 	pl->p_nxt_slot = hpts->p_nxt_slot;
 	pl->p_cur_slot = hpts->p_cur_slot;
 	pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
 	pl->p_flags = (hpts->p_cpu & 0x7f);
 	pl->p_flags <<= 7;
 	pl->p_flags |= (hpts->p_num & 0x7f);
 	pl->p_flags <<= 2;
 	if (hpts->p_hpts_active) {
 		pl->p_flags |= HPTS_HPTS_ACTIVE;
 	}
 }
 
 #define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__)
 
 static void
 hpts_timeout_swi(void *arg)
 {
 	struct tcp_hpts_entry *hpts;
 
 	hpts = (struct tcp_hpts_entry *)arg;
 	swi_sched(hpts->ie_cookie, 0);
 }
 
 static void
 hpts_timeout_dir(void *arg)
 {
 	tcp_hpts_thread(arg);
 }
 
 static inline void
 hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
 {
 #ifdef INVARIANTS
 	if (mtx_owned(&hpts->p_mtx) == 0) {
 		/* We don't own the mutex? */
 		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
 	}
 	if (hpts->p_cpu != inp->inp_hpts_cpu) {
 		/* It is not the right cpu/mutex? */
 		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
 	}
 	if (inp->inp_in_hpts == 0) {
 		/* We are not on the hpts? */
 		panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
 	}
 	if (TAILQ_EMPTY(head) &&
 	    (hpts->p_on_queue_cnt != 0)) {
 		/* We should not be empty with a queue count */
 		panic("%s hpts:%p hpts bucket empty but cnt:%d",
 		    __FUNCTION__, hpts, hpts->p_on_queue_cnt);
 	}
 #endif
 	TAILQ_REMOVE(head, inp, inp_hpts);
 	hpts->p_on_queue_cnt--;
 	if (hpts->p_on_queue_cnt < 0) {
 		/* Count should not go negative .. */
 #ifdef INVARIANTS
 		panic("Hpts goes negative inp:%p hpts:%p",
 		    inp, hpts);
 #endif
 		hpts->p_on_queue_cnt = 0;
 	}
 	if (clear) {
 		inp->inp_hpts_request = 0;
 		inp->inp_in_hpts = 0;
 	}
 }
 
 static inline void
 hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
 {
 #ifdef INVARIANTS
 	if (mtx_owned(&hpts->p_mtx) == 0) {
 		/* We don't own the mutex? */
 		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
 	}
 	if (hpts->p_cpu != inp->inp_hpts_cpu) {
 		/* It is not the right cpu/mutex? */
 		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
 	}
 	if ((noref == 0) && (inp->inp_in_hpts == 1)) {
 		/* We are already on the hpts? */
 		panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
 	}
 #endif
 	TAILQ_INSERT_TAIL(head, inp, inp_hpts);
 	inp->inp_in_hpts = 1;
 	hpts->p_on_queue_cnt++;
 	if (noref == 0) {
 		in_pcbref(inp);
 	}
 }
 
 static inline void
 hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
 {
 #ifdef INVARIANTS
 	if (mtx_owned(&hpts->p_mtx) == 0) {
 		/* We don't own the mutex? */
 		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
 	}
 	if (hpts->p_cpu != inp->inp_input_cpu) {
 		/* It is not the right cpu/mutex? */
 		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
 	}
 	if (inp->inp_in_input == 0) {
 		/* We are not on the input hpts? */
 		panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
 	}
 #endif
 	TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
 	hpts->p_on_inqueue_cnt--;
 	if (hpts->p_on_inqueue_cnt < 0) {
 #ifdef INVARIANTS
 		panic("Hpts in goes negative inp:%p hpts:%p",
 		    inp, hpts);
 #endif
 		hpts->p_on_inqueue_cnt = 0;
 	}
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&hpts->p_input) &&
 	    (hpts->p_on_inqueue_cnt != 0)) {
 		/* We should not be empty with a queue count */
 		panic("%s hpts:%p in_hpts input empty but cnt:%d",
 		    __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
 	}
 #endif
 	if (clear)
 		inp->inp_in_input = 0;
 }
 
 static inline void
 hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
 {
 #ifdef INVARIANTS
 	if (mtx_owned(&hpts->p_mtx) == 0) {
 		/* We don't own the mutex? */
 		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
 	}
 	if (hpts->p_cpu != inp->inp_input_cpu) {
 		/* It is not the right cpu/mutex? */
 		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
 	}
 	if (inp->inp_in_input == 1) {
 		/* We are already on the input hpts? */
 		panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
 	}
 #endif
 	TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
 	inp->inp_in_input = 1;
 	hpts->p_on_inqueue_cnt++;
 	in_pcbref(inp);
 }
 
 static int
 sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
 {
 	struct tcp_hpts_entry *hpts;
 	size_t sz;
 	int32_t logging_was, i;
 	int32_t error = 0;
 
 	/*
 	 * HACK: Turn off logging so no locks are required this really needs
 	 * a memory barrier :)
 	 */
 	logging_was = logging_on;
 	logging_on = 0;
 	if (!req->oldptr) {
 		/* How much? */
 		sz = 0;
 		for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 			hpts = tcp_pace.rp_ent[i];
 			if (hpts->p_log == NULL)
 				continue;
 			sz += (sizeof(struct hpts_log) * hpts->p_logsize);
 		}
 		error = SYSCTL_OUT(req, 0, sz);
 	} else {
 		for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 			hpts = tcp_pace.rp_ent[i];
 			if (hpts->p_log == NULL)
 				continue;
 			if (hpts->p_log_wrapped)
 				sz = (sizeof(struct hpts_log) * hpts->p_logsize);
 			else
 				sz = (sizeof(struct hpts_log) * hpts->p_log_at);
 			error = SYSCTL_OUT(req, hpts->p_log, sz);
 		}
 	}
 	logging_on = logging_was;
 	return error;
 }
 
 SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
     0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
 
 
-/*
- * Try to get the INP_INFO lock.
- *
- * This function always succeeds in getting the lock. It will clear
- * *tpp and return (1) if something critical changed while the inpcb
- * was unlocked. Otherwise, it will leave *tpp unchanged and return (0).
- *
- * This function relies on the fact that the hpts always holds a
- * reference on the inpcb while the segment is on the hptsi wheel and
- * in the input queue.
- *
- */
-static int
-tcp_hptsi_lock_inpinfo(struct inpcb *inp, struct tcpcb **tpp)
-{
-	struct tcp_function_block *tfb;
-	struct tcpcb *tp;
-	void *ptr;
-
-	/* Try the easy way. */
-	if (INP_INFO_TRY_RLOCK(&V_tcbinfo))
-		return (0);
-
-	/*
-	 * OK, let's try the hard way. We'll save the function pointer block
-	 * to make sure that doesn't change while we aren't holding the
-	 * lock.
-	 */
-	tp = *tpp;
-	tfb = tp->t_fb;
-	ptr = tp->t_fb_ptr;
-	INP_WUNLOCK(inp);
-	INP_INFO_RLOCK(&V_tcbinfo);
-	INP_WLOCK(inp);
-	/* If the session went away, return an error. */
-	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
-	    (inp->inp_flags2 & INP_FREED)) {
-		*tpp = NULL;
-		return (1);
-	}
-	/*
-	 * If the function block or stack-specific data block changed,
-	 * report an error.
-	 */
-	tp = intotcpcb(inp);
-	if ((tp->t_fb != tfb) && (tp->t_fb_ptr != ptr)) {
-		*tpp = NULL;
-		return (1);
-	}
-	return (0);
-}
-
-
 static void
 tcp_wakehpts(struct tcp_hpts_entry *hpts)
 {
 	HPTS_MTX_ASSERT(hpts);
 	swi_sched(hpts->ie_cookie, 0);
 	if (hpts->p_hpts_active == 2) {
 		/* Rare sleeping on a ENOBUF */
 		wakeup_one(hpts);
 	}
 }
 
 static void
 tcp_wakeinput(struct tcp_hpts_entry *hpts)
 {
 	HPTS_MTX_ASSERT(hpts);
 	swi_sched(hpts->ie_cookie, 0);
 	if (hpts->p_hpts_active == 2) {
 		/* Rare sleeping on a ENOBUF */
 		wakeup_one(hpts);
 	}
 }
 
 struct tcp_hpts_entry *
 tcp_cur_hpts(struct inpcb *inp)
 {
 	int32_t hpts_num;
 	struct tcp_hpts_entry *hpts;
 
 	hpts_num = inp->inp_hpts_cpu;
 	hpts = tcp_pace.rp_ent[hpts_num];
 	return (hpts);
 }
 
 struct tcp_hpts_entry *
 tcp_hpts_lock(struct inpcb *inp)
 {
 	struct tcp_hpts_entry *hpts;
 	int32_t hpts_num;
 
 again:
 	hpts_num = inp->inp_hpts_cpu;
 	hpts = tcp_pace.rp_ent[hpts_num];
 #ifdef INVARIANTS
 	if (mtx_owned(&hpts->p_mtx)) {
 		panic("Hpts:%p owns mtx prior-to lock line:%d",
 		    hpts, __LINE__);
 	}
 #endif
 	mtx_lock(&hpts->p_mtx);
 	if (hpts_num != inp->inp_hpts_cpu) {
 		mtx_unlock(&hpts->p_mtx);
 		goto again;
 	}
 	return (hpts);
 }
 
 struct tcp_hpts_entry *
 tcp_input_lock(struct inpcb *inp)
 {
 	struct tcp_hpts_entry *hpts;
 	int32_t hpts_num;
 
 again:
 	hpts_num = inp->inp_input_cpu;
 	hpts = tcp_pace.rp_ent[hpts_num];
 #ifdef INVARIANTS
 	if (mtx_owned(&hpts->p_mtx)) {
 		panic("Hpts:%p owns mtx prior-to lock line:%d",
 		    hpts, __LINE__);
 	}
 #endif
 	mtx_lock(&hpts->p_mtx);
 	if (hpts_num != inp->inp_input_cpu) {
 		mtx_unlock(&hpts->p_mtx);
 		goto again;
 	}
 	return (hpts);
 }
 
 static void
 tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
 {
 	int32_t add_freed;
 
 	if (inp->inp_flags2 & INP_FREED) {
 		/*
 		 * Need to play a special trick so that in_pcbrele_wlocked
 		 * does not return 1 when it really should have returned 0.
 		 */
 		add_freed = 1;
 		inp->inp_flags2 &= ~INP_FREED;
 	} else {
 		add_freed = 0;
 	}
 #ifndef INP_REF_DEBUG
 	if (in_pcbrele_wlocked(inp)) {
 		/*
 		 * This should not happen. We have the inpcb referred to by
 		 * the main socket (why we are called) and the hpts. It
 		 * should always return 0.
 		 */
 		panic("inpcb:%p release ret 1",
 		    inp);
 	}
 #else
 	if (__in_pcbrele_wlocked(inp, line)) {
 		/*
 		 * This should not happen. We have the inpcb referred to by
 		 * the main socket (why we are called) and the hpts. It
 		 * should always return 0.
 		 */
 		panic("inpcb:%p release ret 1",
 		    inp);
 	}
 #endif
 	if (add_freed) {
 		inp->inp_flags2 |= INP_FREED;
 	}
 }
 
 static void
 tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
 {
 	if (inp->inp_in_hpts) {
 		hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
 		tcp_remove_hpts_ref(inp, hpts, line);
 	}
 }
 
 static void
 tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
 {
 	HPTS_MTX_ASSERT(hpts);
 	if (inp->inp_in_input) {
 		hpts_sane_input_remove(hpts, inp, 1);
 		tcp_remove_hpts_ref(inp, hpts, line);
 	}
 }
 
 /*
  * Called normally with the INP_LOCKED but it
  * does not matter, the hpts lock is the key
  * but the lock order allows us to hold the
  * INP lock and then get the hpts lock.
  *
  * Valid values in the flags are
  * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
  * HPTS_REMOVE_INPUT - remove from the input of the hpts.
  * Note that you can or both values together and get two
  * actions.
  */
 void
 __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
 {
 	struct tcp_hpts_entry *hpts;
 
 	INP_WLOCK_ASSERT(inp);
 	if (flags & HPTS_REMOVE_OUTPUT) {
 		hpts = tcp_hpts_lock(inp);
 		tcp_hpts_remove_locked_output(hpts, inp, flags, line);
 		mtx_unlock(&hpts->p_mtx);
 	}
 	if (flags & HPTS_REMOVE_INPUT) {
 		hpts = tcp_input_lock(inp);
 		tcp_hpts_remove_locked_input(hpts, inp, flags, line);
 		mtx_unlock(&hpts->p_mtx);
 	}
 }
 
 static inline int
 hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
 {
 	return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS);
 }
 
 static int
 tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
 {
 	int32_t need_wake = 0;
 	uint32_t ticknow = 0;
 
 	HPTS_MTX_ASSERT(hpts);
 	if (inp->inp_in_hpts == 0) {
 		/* Ok we need to set it on the hpts in the current slot */
 		if (hpts->p_hpts_active == 0) {
 			/* A sleeping hpts we want in next slot to run */
 			if (logging_on) {
 				tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
 				    hpts_tick(hpts, 1));
 			}
 			inp->inp_hptsslot = hpts_tick(hpts, 1);
 			inp->inp_hpts_request = 0;
 			if (logging_on) {
 				tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
 			}
 			need_wake = 1;
 		} else if ((void *)inp == hpts->p_inp) {
 			/*
 			 * We can't allow you to go into the same slot we
 			 * are in. We must put you out.
 			 */
 			inp->inp_hptsslot = hpts->p_nxt_slot;
 		} else
 			inp->inp_hptsslot = hpts->p_cur_slot;
 		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
 		inp->inp_hpts_request = 0;
 		if (logging_on) {
 			tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
 		}
 		if (need_wake) {
 			/*
 			 * Activate the hpts if it is sleeping and its
 			 * timeout is not 1.
 			 */
 			if (logging_on) {
 				tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
 			}
 			hpts->p_direct_wake = 1;
 			tcp_wakehpts(hpts);
 		}
 	}
 	return (need_wake);
 }
 
 int
 __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line)
 {
 	int32_t ret;
 	struct tcp_hpts_entry *hpts;
 
 	INP_WLOCK_ASSERT(inp);
 	hpts = tcp_hpts_lock(inp);
 	ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
 	mtx_unlock(&hpts->p_mtx);
 	return (ret);
 }
 
 static void
 tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line,
     struct hpts_diag *diag, int32_t noref)
 {
 	int32_t need_new_to = 0;
 	int32_t need_wakeup = 0;
 	uint32_t largest_slot;
 	uint32_t ticknow = 0;
 	uint32_t slot_calc;
 
 	HPTS_MTX_ASSERT(hpts);
 	if (diag) {
 		memset(diag, 0, sizeof(struct hpts_diag));
 		diag->p_hpts_active = hpts->p_hpts_active;
 		diag->p_nxt_slot = hpts->p_nxt_slot;
 		diag->p_cur_slot = hpts->p_cur_slot;
 		diag->slot_req = slot;
 	}
 	if ((inp->inp_in_hpts == 0) || noref) {
 		inp->inp_hpts_request = slot;
 		if (slot == 0) {
 			/* Immediate */
 			tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref);
 			return;
 		}
 		if (hpts->p_hpts_active) {
 			/*
 			 * Its slot - 1 since nxt_slot is the next tick that
 			 * will go off since the hpts is awake
 			 */
 			if (logging_on) {
 				tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0);
 			}
 			/*
 			 * We want to make sure that we don't place a inp in
 			 * the range of p_cur_slot <-> p_nxt_slot. If we
 			 * take from p_nxt_slot to the end, plus p_cur_slot
 			 * and then take away 2, we will know how many is
 			 * the max slots we can use.
 			 */
 			if (hpts->p_nxt_slot > hpts->p_cur_slot) {
 				/*
 				 * Non-wrap case nxt_slot <-> cur_slot we
 				 * don't want to land in. So the diff gives
 				 * us what is taken away from the number of
 				 * slots.
 				 */
 				largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot);
 			} else if (hpts->p_nxt_slot == hpts->p_cur_slot) {
 				largest_slot = NUM_OF_HPTSI_SLOTS - 2;
 			} else {
 				/*
 				 * Wrap case so the diff gives us the number
 				 * of slots that we can land in.
 				 */
 				largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot;
 			}
 			/*
 			 * We take away two so we never have a problem (20
 			 * usec's) out of 1024000 usecs
 			 */
 			largest_slot -= 2;
 			if (inp->inp_hpts_request > largest_slot) {
 				/*
 				 * Restrict max jump of slots and remember
 				 * leftover
 				 */
 				slot = largest_slot;
 				inp->inp_hpts_request -= largest_slot;
 			} else {
 				/* This one will run when we hit it */
 				inp->inp_hpts_request = 0;
 			}
 			if (hpts->p_nxt_slot == hpts->p_cur_slot)
 				slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS;
 			else
 				slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS;
 			if (slot_calc == hpts->p_cur_slot) {
 #ifdef INVARIANTS
 				/* TSNH */
 				panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n",
 				    hpts, slot_calc, slot, largest_slot);
 #endif
 				if (slot_calc)
 					slot_calc--;
 				else
 					slot_calc = NUM_OF_HPTSI_SLOTS - 1;
 			}
 			inp->inp_hptsslot = slot_calc;
 			if (diag) {
 				diag->inp_hptsslot = inp->inp_hptsslot;
 			}
 		} else {
 			/*
 			 * The hpts is sleeping, we need to figure out where
 			 * it will wake up at and if we need to reschedule
 			 * its time-out.
 			 */
 			uint32_t have_slept, yet_to_sleep;
 			uint32_t slot_now;
 			struct timeval tv;
 
 			ticknow = tcp_gethptstick(&tv);
 			slot_now = ticknow % NUM_OF_HPTSI_SLOTS;
 			/*
 			 * The user wants to be inserted at (slot_now +
 			 * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up.
 			 */
 			largest_slot = NUM_OF_HPTSI_SLOTS - 2;
 			if (inp->inp_hpts_request > largest_slot) {
 				/* Adjust the residual in inp_hpts_request */
 				slot = largest_slot;
 				inp->inp_hpts_request -= largest_slot;
 			} else {
 				/* No residual it all fits */
 				inp->inp_hpts_request = 0;
 			}
 			inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS;
 			if (diag) {
 				diag->slot_now = slot_now;
 				diag->inp_hptsslot = inp->inp_hptsslot;
 				diag->p_on_min_sleep = hpts->p_on_min_sleep;
 			}
 			if (logging_on) {
 				tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow);
 			}
 			/* Now do we need to restart the hpts's timer? */
 			if (TSTMP_GT(ticknow, hpts->p_curtick))
 				have_slept = ticknow - hpts->p_curtick;
 			else
 				have_slept = 0;
 			if (have_slept < hpts->p_hpts_sleep_time) {
 				/* This should be what happens */
 				yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
 			} else {
 				/* We are over-due */
 				yet_to_sleep = 0;
 				need_wakeup = 1;
 			}
 			if (diag) {
 				diag->have_slept = have_slept;
 				diag->yet_to_sleep = yet_to_sleep;
 				diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
 			}
 			if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) {
 				/*
 				 * We need to reschedule the hptss time-out.
 				 */
 				hpts->p_hpts_sleep_time = slot;
 				need_new_to = slot * HPTS_TICKS_PER_USEC;
 			}
 		}
 		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
 		if (logging_on) {
 			tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow);
 		}
 		/*
 		 * Now how far is the hpts sleeping to? if active is 1, its
 		 * up and ticking we do nothing, otherwise we may need to
 		 * reschedule its callout if need_new_to is set from above.
 		 */
 		if (need_wakeup) {
 			if (logging_on) {
 				tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0);
 			}
 			hpts->p_direct_wake = 1;
 			tcp_wakehpts(hpts);
 			if (diag) {
 				diag->need_new_to = 0;
 				diag->co_ret = 0xffff0000;
 			}
 		} else if (need_new_to) {
 			int32_t co_ret;
 			struct timeval tv;
 			sbintime_t sb;
 
 			tv.tv_sec = 0;
 			tv.tv_usec = 0;
 			while (need_new_to > HPTS_USEC_IN_SEC) {
 				tv.tv_sec++;
 				need_new_to -= HPTS_USEC_IN_SEC;
 			}
 			tv.tv_usec = need_new_to;
 			sb = tvtosbt(tv);
 			if (tcp_hpts_callout_skip_swi == 0) {
 				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
 				    hpts_timeout_swi, hpts, hpts->p_cpu,
 				    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 			} else {
 				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
 				    hpts_timeout_dir, hpts,
 				    hpts->p_cpu,
 				    C_PREL(tcp_hpts_precision));
 			}
 			if (diag) {
 				diag->need_new_to = need_new_to;
 				diag->co_ret = co_ret;
 			}
 		}
 	} else {
 #ifdef INVARIANTS
 		panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp);
 #endif
 	}
 }
 
 uint32_t
 tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){
 	struct tcp_hpts_entry *hpts;
 	uint32_t slot_on, cts;
 	struct timeval tv;
 
 	/*
 	 * We now return the next-slot the hpts will be on, beyond its
 	 * current run (if up) or where it was when it stopped if it is
 	 * sleeping.
 	 */
 	INP_WLOCK_ASSERT(inp);
 	hpts = tcp_hpts_lock(inp);
 	if (in_ts_percision)
 		microuptime(&tv);
 	else
 		getmicrouptime(&tv);
 	cts = tcp_tv_to_usectick(&tv);
 	tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0);
 	slot_on = hpts->p_nxt_slot;
 	mtx_unlock(&hpts->p_mtx);
 	return (slot_on);
 }
 
 uint32_t
 __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
 	return (tcp_hpts_insert_diag(inp, slot, line, NULL));
 }
 
 int
 __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
 {
 	int32_t retval = 0;
 
 	HPTS_MTX_ASSERT(hpts);
 	if (inp->inp_in_input == 0) {
 		/* Ok we need to set it on the hpts in the current slot */
 		hpts_sane_input_insert(hpts, inp, line);
 		retval = 1;
 		if (hpts->p_hpts_active == 0) {
 			/*
 			 * Activate the hpts if it is sleeping.
 			 */
 			if (logging_on) {
 				tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0);
 			}
 			retval = 2;
 			hpts->p_direct_wake = 1;
 			tcp_wakeinput(hpts);
 		}
 	} else if (hpts->p_hpts_active == 0) {
 		retval = 4;
 		hpts->p_direct_wake = 1;
 		tcp_wakeinput(hpts);
 	}
 	return (retval);
 }
 
 void
 tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
     int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked)
 {
 	/* Setup packet for input first */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t));
 	m->m_pkthdr.pace_tlen = (uint16_t) tlen;
 	m->m_pkthdr.pace_drphdrlen = drop_hdrlen;
 	m->m_pkthdr.pace_tos = iptos;
 	m->m_pkthdr.pace_lock = (uint8_t) ti_locked;
 	if (tp->t_in_pkt == NULL) {
 		tp->t_in_pkt = m;
 		tp->t_tail_pkt = m;
 	} else {
 		tp->t_tail_pkt->m_nextpkt = m;
 		tp->t_tail_pkt = m;
 	}
 }
 
 
 int32_t
 __tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
     int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line){
 	struct tcp_hpts_entry *hpts;
 	int32_t ret;
 
 	tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos, ti_locked);
 	hpts = tcp_input_lock(tp->t_inpcb);
 	ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line);
 	mtx_unlock(&hpts->p_mtx);
 	return (ret);
 }
 
 void
 __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line)
 {
 	struct tcp_hpts_entry *hpts;
 	struct tcpcb *tp;
 
 	tp = intotcpcb(inp);
 	hpts = tcp_input_lock(tp->t_inpcb);
 	if (inp->inp_in_input == 0) {
 		/* Ok we need to set it on the hpts in the current slot */
 		hpts_sane_input_insert(hpts, inp, line);
 		if (hpts->p_hpts_active == 0) {
 			/*
 			 * Activate the hpts if it is sleeping.
 			 */
 			hpts->p_direct_wake = 1;
 			tcp_wakeinput(hpts);
 		}
 	} else if (hpts->p_hpts_active == 0) {
 		hpts->p_direct_wake = 1;
 		tcp_wakeinput(hpts);
 	}
 	inp->inp_hpts_drop_reas = reason;
 	mtx_unlock(&hpts->p_mtx);
 }
 
 static uint16_t
 hpts_random_cpu(struct inpcb *inp){
 	/*
 	 * No flow type set distribute the load randomly.
 	 */
 	uint16_t cpuid;
 	uint32_t ran;
 
 	/*
 	 * If one has been set use it i.e. we want both in and out on the
 	 * same hpts.
 	 */
 	if (inp->inp_input_cpu_set) {
 		return (inp->inp_input_cpu);
 	} else if (inp->inp_hpts_cpu_set) {
 		return (inp->inp_hpts_cpu);
 	}
 	/* Nothing set use a random number */
 	ran = arc4random();
 	cpuid = (ran & 0xffff) % mp_ncpus;
 	return (cpuid);
 }
 
 static uint16_t
 hpts_cpuid(struct inpcb *inp){
 	uint16_t cpuid;
 
 
 	/*
 	 * If one has been set use it i.e. we want both in and out on the
 	 * same hpts.
 	 */
 	if (inp->inp_input_cpu_set) {
 		return (inp->inp_input_cpu);
 	} else if (inp->inp_hpts_cpu_set) {
 		return (inp->inp_hpts_cpu);
 	}
 	/* If one is set the other must be the same */
 #ifdef	RSS
 	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 	if (cpuid == NETISR_CPUID_NONE)
 		return (hpts_random_cpu(inp));
 	else
 		return (cpuid);
 #else
 	/*
 	 * We don't have a flowid -> cpuid mapping, so cheat and just map
 	 * unknown cpuids to curcpu.  Not the best, but apparently better
 	 * than defaulting to swi 0.
 	 */
 	if (inp->inp_flowtype != M_HASHTYPE_NONE) {
 		cpuid = inp->inp_flowid % mp_ncpus;
 		return (cpuid);
 	}
 	cpuid = hpts_random_cpu(inp);
 	return (cpuid);
 #endif
 }
 
 /*
  * Do NOT try to optimize the processing of inp's
  * by first pulling off all the inp's into a temporary
  * list (e.g. TAILQ_CONCAT). If you do that the subtle
  * interactions of switching CPU's will kill because of
  * problems in the linked list manipulation. Basically
  * you would switch cpu's with the hpts mutex locked
  * but then while you were processing one of the inp's
  * some other one that you switch will get a new
  * packet on the different CPU. It will insert it
  * on the new hptss input list. Creating a temporary
  * link in the inp will not fix it either, since
  * the other hpts will be doing the same thing and
  * you will both end up using the temporary link.
  *
  * You will die in an ASSERT for tailq corruption if you
  * run INVARIANTS or you will die horribly without
  * INVARIANTS in some unknown way with a corrupt linked
  * list.
  */
 static void
 tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
 {
 	struct mbuf *m, *n;
 	struct tcpcb *tp;
 	struct inpcb *inp;
 	uint16_t drop_reason;
 	int16_t set_cpu;
 	uint32_t did_prefetch = 0;
 	int32_t ti_locked = TI_UNLOCKED;
 
 	HPTS_MTX_ASSERT(hpts);
 	while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
 		HPTS_MTX_ASSERT(hpts);
 		hpts_sane_input_remove(hpts, inp, 0);
 		if (inp->inp_input_cpu_set == 0) {
 			set_cpu = 1;
 		} else {
 			set_cpu = 0;
 		}
 		hpts->p_inp = inp;
 		drop_reason = inp->inp_hpts_drop_reas;
 		inp->inp_in_input = 0;
 		tp = intotcpcb(inp);
 		mtx_unlock(&hpts->p_mtx);
 		CURVNET_SET(inp->inp_vnet);
 		if (drop_reason) {
 			INP_INFO_RLOCK(&V_tcbinfo);
 			ti_locked = TI_RLOCKED;
 		} else {
 			ti_locked = TI_UNLOCKED;
 		}
 		INP_WLOCK(inp);
 		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
 		    (inp->inp_flags2 & INP_FREED)) {
 out:
 			hpts->p_inp = NULL;
 			if (ti_locked == TI_RLOCKED) {
 				INP_INFO_RUNLOCK(&V_tcbinfo);
 			}
 			if (in_pcbrele_wlocked(inp) == 0) {
 				INP_WUNLOCK(inp);
 			}
 			ti_locked = TI_UNLOCKED;
 			CURVNET_RESTORE();
 			mtx_lock(&hpts->p_mtx);
 			continue;
 		}
 		if ((tp == NULL) || (tp->t_inpcb == NULL)) {
 			goto out;
 		}
 		if (drop_reason) {
 			/* This tcb is being destroyed for drop_reason */
 			m = tp->t_in_pkt;
 			if (m)
 				n = m->m_nextpkt;
 			else
 				n = NULL;
 			tp->t_in_pkt = NULL;
 			while (m) {
 				m_freem(m);
 				m = n;
 				if (m)
 					n = m->m_nextpkt;
 			}
 			tp = tcp_drop(tp, drop_reason);
 			INP_INFO_RUNLOCK(&V_tcbinfo);
 			if (tp == NULL) {
 				INP_WLOCK(inp);
 			}
 			if (in_pcbrele_wlocked(inp) == 0)
 				INP_WUNLOCK(inp);
 			CURVNET_RESTORE();
 			mtx_lock(&hpts->p_mtx);
 			continue;
 		}
 		if (set_cpu) {
 			/*
 			 * Setup so the next time we will move to the right
 			 * CPU. This should be a rare event. It will
 			 * sometimes happens when we are the client side
 			 * (usually not the server). Somehow tcp_output()
 			 * gets called before the tcp_do_segment() sets the
 			 * intial state. This means the r_cpu and r_hpts_cpu
 			 * is 0. We get on the hpts, and then tcp_input()
 			 * gets called setting up the r_cpu to the correct
 			 * value. The hpts goes off and sees the mis-match.
 			 * We simply correct it here and the CPU will switch
 			 * to the new hpts nextime the tcb gets added to the
 			 * the hpts (not this time) :-)
 			 */
 			tcp_set_hpts(inp);
 		}
 		m = tp->t_in_pkt;
 		n = NULL;
 		if (m != NULL &&
 		    (m->m_pkthdr.pace_lock == TI_RLOCKED ||
 		    tp->t_state != TCPS_ESTABLISHED)) {
 			ti_locked = TI_RLOCKED;
-			if (tcp_hptsi_lock_inpinfo(inp, &tp)) {
-				CURVNET_RESTORE();
-				goto out;
-			}
+			INP_INFO_RLOCK(&V_tcbinfo);
 			m = tp->t_in_pkt;
 		}
 		if (in_newts_every_tcb) {
 			if (in_ts_percision)
 				microuptime(tv);
 			else
 				getmicrouptime(tv);
 		}
 		if (tp->t_fb_ptr != NULL) {
 			kern_prefetch(tp->t_fb_ptr, &did_prefetch);
 			did_prefetch = 1;
 		}
 		/* Any input work to do, if so do it first */
 		if ((m != NULL) && (m == tp->t_in_pkt)) {
 			struct tcphdr *th;
 			int32_t tlen, drop_hdrlen, nxt_pkt;
 			uint8_t iptos;
 
 			n = m->m_nextpkt;
 			tp->t_in_pkt = tp->t_tail_pkt = NULL;
 			while (m) {
 				th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff);
 				tlen = m->m_pkthdr.pace_tlen;
 				drop_hdrlen = m->m_pkthdr.pace_drphdrlen;
 				iptos = m->m_pkthdr.pace_tos;
 				m->m_nextpkt = NULL;
 				if (n)
 					nxt_pkt = 1;
 				else
 					nxt_pkt = 0;
 				inp->inp_input_calls = 1;
 				if (tp->t_fb->tfb_tcp_hpts_do_segment) {
 					/* Use the hpts specific do_segment */
 					(*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket,
 					    tp, drop_hdrlen,
 					    tlen, iptos, ti_locked, nxt_pkt, tv);
 				} else {
 					/* Use the default do_segment */
 					(*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket,
 					    tp, drop_hdrlen,
 					    tlen, iptos, ti_locked);
 				}
 				/*
 				 * Do segment returns unlocked we need the
 				 * lock again but we also need some kasserts
 				 * here.
 				 */
 				INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 				INP_UNLOCK_ASSERT(inp);
 				m = n;
 				if (m)
 					n = m->m_nextpkt;
 				if (m != NULL &&
 				    m->m_pkthdr.pace_lock == TI_RLOCKED) {
 					INP_INFO_RLOCK(&V_tcbinfo);
 					ti_locked = TI_RLOCKED;
 				} else
 					ti_locked = TI_UNLOCKED;
 				INP_WLOCK(inp);
 				/*
 				 * Since we have an opening here we must
 				 * re-check if the tcb went away while we
 				 * were getting the lock(s).
 				 */
 				if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
 				    (inp->inp_flags2 & INP_FREED)) {
-			out_free:
 					while (m) {
 						m_freem(m);
 						m = n;
 						if (m)
 							n = m->m_nextpkt;
 					}
 					goto out;
 				}
 				/*
 				 * Now that we hold the INP lock, check if
 				 * we need to upgrade our lock.
 				 */
 				if (ti_locked == TI_UNLOCKED &&
 				    (tp->t_state != TCPS_ESTABLISHED)) {
 					ti_locked = TI_RLOCKED;
-					if (tcp_hptsi_lock_inpinfo(inp, &tp))
-						goto out_free;
+					INP_INFO_RLOCK(&V_tcbinfo);
 				}
 			}	/** end while(m) */
 		}		/** end if ((m != NULL)  && (m == tp->t_in_pkt)) */
 		if (in_pcbrele_wlocked(inp) == 0)
 			INP_WUNLOCK(inp);
 		if (ti_locked == TI_RLOCKED)
 			INP_INFO_RUNLOCK(&V_tcbinfo);
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 		INP_UNLOCK_ASSERT(inp);
 		ti_locked = TI_UNLOCKED;
 		mtx_lock(&hpts->p_mtx);
 		hpts->p_inp = NULL;
 		CURVNET_RESTORE();
 	}
 }
 
 static int
 tcp_hpts_est_run(struct tcp_hpts_entry *hpts)
 {
 	int32_t ticks_to_run;
 
 	if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) {
 		ticks_to_run = hpts->p_curtick - hpts->p_prevtick;
 		if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) {
 			ticks_to_run = NUM_OF_HPTSI_SLOTS - 2;
 		}
 	} else {
 		if (hpts->p_prevtick == hpts->p_curtick) {
 			/* This happens when we get woken up right away */
 			return (-1);
 		}
 		ticks_to_run = 1;
 	}
 	/* Set in where we will be when we catch up */
 	hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS;
 	if (hpts->p_nxt_slot == hpts->p_cur_slot) {
 		panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d",
 		    hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run);
 	}
 	return (ticks_to_run);
 }
 
 static void
 tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick)
 {
 	struct tcpcb *tp;
 	struct inpcb *inp = NULL, *ninp;
 	struct timeval tv;
 	int32_t ticks_to_run, i, error, tick_now, interum_tick;
 	int32_t paced_cnt = 0;
 	int32_t did_prefetch = 0;
 	int32_t prefetch_ninp = 0;
 	int32_t prefetch_tp = 0;
 	uint32_t cts;
 	int16_t set_cpu;
 
 	HPTS_MTX_ASSERT(hpts);
 	hpts->p_curtick = tcp_tv_to_hptstick(ctick);
 	cts = tcp_tv_to_usectick(ctick);
 	memcpy(&tv, ctick, sizeof(struct timeval));
 	hpts->p_cur_slot = hpts_tick(hpts, 1);
 
 	/* Figure out if we had missed ticks */
 again:
 	HPTS_MTX_ASSERT(hpts);
 	ticks_to_run = tcp_hpts_est_run(hpts);
 	if (!TAILQ_EMPTY(&hpts->p_input)) {
 		tcp_input_data(hpts, &tv);
 	}
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&hpts->p_input) &&
 	    (hpts->p_on_inqueue_cnt != 0)) {
 		panic("tp:%p in_hpts input empty but cnt:%d",
 		    hpts, hpts->p_on_inqueue_cnt);
 	}
 #endif
 	HPTS_MTX_ASSERT(hpts);
 	/* Reset the ticks to run and time if we need too */
 	interum_tick = tcp_gethptstick(&tv);
 	if (interum_tick != hpts->p_curtick) {
 		/* Save off the new time we execute to */
 		*ctick = tv;
 		hpts->p_curtick = interum_tick;
 		cts = tcp_tv_to_usectick(&tv);
 		hpts->p_cur_slot = hpts_tick(hpts, 1);
 		ticks_to_run = tcp_hpts_est_run(hpts);
 	}
 	if (ticks_to_run == -1) {
 		goto no_run;
 	}
 	if (logging_on) {
 		tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0);
 	}
 	if (hpts->p_on_queue_cnt == 0) {
 		goto no_one;
 	}
 	HPTS_MTX_ASSERT(hpts);
 	for (i = 0; i < ticks_to_run; i++) {
 		/*
 		 * Calculate our delay, if there are no extra ticks there
 		 * was not any
 		 */
 		hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
 		HPTS_MTX_ASSERT(hpts);
 		while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
 			/* For debugging */
 			if (logging_on) {
 				tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i);
 			}
 			hpts->p_inp = inp;
 			paced_cnt++;
 			if (hpts->p_cur_slot != inp->inp_hptsslot) {
 				panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
 				    hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot);
 			}
 			/* Now pull it */
 			if (inp->inp_hpts_cpu_set == 0) {
 				set_cpu = 1;
 			} else {
 				set_cpu = 0;
 			}
 			hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0);
 			if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
 				/* We prefetch the next inp if possible */
 				kern_prefetch(ninp, &prefetch_ninp);
 				prefetch_ninp = 1;
 			}
 			if (inp->inp_hpts_request) {
 				/*
 				 * This guy is deferred out further in time
 				 * then our wheel had on it. Push him back
 				 * on the wheel.
 				 */
 				int32_t remaining_slots;
 
 				remaining_slots = ticks_to_run - (i + 1);
 				if (inp->inp_hpts_request > remaining_slots) {
 					/*
 					 * Keep INVARIANTS happy by clearing
 					 * the flag
 					 */
 					tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1);
 					hpts->p_inp = NULL;
 					continue;
 				}
 				inp->inp_hpts_request = 0;
 			}
 			/*
 			 * We clear the hpts flag here after dealing with
 			 * remaining slots. This way anyone looking with the
 			 * TCB lock will see its on the hpts until just
 			 * before we unlock.
 			 */
 			inp->inp_in_hpts = 0;
 			mtx_unlock(&hpts->p_mtx);
 			INP_WLOCK(inp);
 			if (in_pcbrele_wlocked(inp)) {
 				mtx_lock(&hpts->p_mtx);
 				if (logging_on)
 					tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1);
 				hpts->p_inp = NULL;
 				continue;
 			}
 			if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 out_now:
 #ifdef INVARIANTS
 				if (mtx_owned(&hpts->p_mtx)) {
 					panic("Hpts:%p owns mtx prior-to lock line:%d",
 					    hpts, __LINE__);
 				}
 #endif
 				INP_WUNLOCK(inp);
 				mtx_lock(&hpts->p_mtx);
 				if (logging_on)
 					tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3);
 				hpts->p_inp = NULL;
 				continue;
 			}
 			tp = intotcpcb(inp);
 			if ((tp == NULL) || (tp->t_inpcb == NULL)) {
 				goto out_now;
 			}
 			if (set_cpu) {
 				/*
 				 * Setup so the next time we will move to
 				 * the right CPU. This should be a rare
 				 * event. It will sometimes happens when we
 				 * are the client side (usually not the
 				 * server). Somehow tcp_output() gets called
 				 * before the tcp_do_segment() sets the
 				 * intial state. This means the r_cpu and
 				 * r_hpts_cpu is 0. We get on the hpts, and
 				 * then tcp_input() gets called setting up
 				 * the r_cpu to the correct value. The hpts
 				 * goes off and sees the mis-match. We
 				 * simply correct it here and the CPU will
 				 * switch to the new hpts nextime the tcb
 				 * gets added to the the hpts (not this one)
 				 * :-)
 				 */
 				tcp_set_hpts(inp);
 			}
 			if (out_newts_every_tcb) {
 				struct timeval sv;
 
 				if (out_ts_percision)
 					microuptime(&sv);
 				else
 					getmicrouptime(&sv);
 				cts = tcp_tv_to_usectick(&sv);
 			}
 			CURVNET_SET(inp->inp_vnet);
 			/*
 			 * There is a hole here, we get the refcnt on the
 			 * inp so it will still be preserved but to make
 			 * sure we can get the INP we need to hold the p_mtx
 			 * above while we pull out the tp/inp,  as long as
 			 * fini gets the lock first we are assured of having
 			 * a sane INP we can lock and test.
 			 */
 #ifdef INVARIANTS
 			if (mtx_owned(&hpts->p_mtx)) {
 				panic("Hpts:%p owns mtx before tcp-output:%d",
 				    hpts, __LINE__);
 			}
 #endif
 			if (tp->t_fb_ptr != NULL) {
 				kern_prefetch(tp->t_fb_ptr, &did_prefetch);
 				did_prefetch = 1;
 			}
 			inp->inp_hpts_calls = 1;
 			if (tp->t_fb->tfb_tcp_output_wtime != NULL) {
 				error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv);
 			} else {
 				error = tp->t_fb->tfb_tcp_output(tp);
 			}
 			if (ninp && ninp->inp_ppcb) {
 				/*
 				 * If we have a nxt inp, see if we can
 				 * prefetch its ppcb. Note this may seem
 				 * "risky" since we have no locks (other
 				 * than the previous inp) and there no
 				 * assurance that ninp was not pulled while
 				 * we were processing inp and freed. If this
 				 * occured it could mean that either:
 				 *
 				 * a) Its NULL (which is fine we won't go
 				 * here) <or> b) Its valid (which is cool we
 				 * will prefetch it) <or> c) The inp got
 				 * freed back to the slab which was
 				 * reallocated. Then the piece of memory was
 				 * re-used and something else (not an
 				 * address) is in inp_ppcb. If that occurs
 				 * we don't crash, but take a TLB shootdown
 				 * performance hit (same as if it was NULL
 				 * and we tried to pre-fetch it).
 				 *
 				 * Considering that the likelyhood of <c> is
 				 * quite rare we will take a risk on doing
 				 * this. If performance drops after testing
 				 * we can always take this out. NB: the
 				 * kern_prefetch on amd64 actually has
 				 * protection against a bad address now via
 				 * the DMAP_() tests. This will prevent the
 				 * TLB hit, and instead if <c> occurs just
 				 * cause us to load cache with a useless
 				 * address (to us).
 				 */
 				kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
 				prefetch_tp = 1;
 			}
 			INP_WUNLOCK(inp);
 			INP_UNLOCK_ASSERT(inp);
 			CURVNET_RESTORE();
 #ifdef INVARIANTS
 			if (mtx_owned(&hpts->p_mtx)) {
 				panic("Hpts:%p owns mtx prior-to lock line:%d",
 				    hpts, __LINE__);
 			}
 #endif
 			mtx_lock(&hpts->p_mtx);
 			if (logging_on)
 				tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4);
 			hpts->p_inp = NULL;
 		}
 		HPTS_MTX_ASSERT(hpts);
 		hpts->p_inp = NULL;
 		hpts->p_cur_slot++;
 		if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) {
 			hpts->p_cur_slot = 0;
 		}
 	}
 no_one:
 	HPTS_MTX_ASSERT(hpts);
 	hpts->p_prevtick = hpts->p_curtick;
 	hpts->p_delayed_by = 0;
 	/*
 	 * Check to see if we took an excess amount of time and need to run
 	 * more ticks (if we did not hit eno-bufs).
 	 */
 	/* Re-run any input that may be there */
 	(void)tcp_gethptstick(&tv);
 	if (!TAILQ_EMPTY(&hpts->p_input)) {
 		tcp_input_data(hpts, &tv);
 	}
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&hpts->p_input) &&
 	    (hpts->p_on_inqueue_cnt != 0)) {
 		panic("tp:%p in_hpts input empty but cnt:%d",
 		    hpts, hpts->p_on_inqueue_cnt);
 	}
 #endif
 	tick_now = tcp_gethptstick(&tv);
 	if (SEQ_GT(tick_now, hpts->p_prevtick)) {
 		struct timeval res;
 
 		/* Did we really spend a full tick or more in here? */
 		timersub(&tv, ctick, &res);
 		if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) {
 			counter_u64_add(hpts_loops, 1);
 			if (logging_on) {
 				tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now);
 			}
 			*ctick = res;
 			hpts->p_curtick = tick_now;
 			goto again;
 		}
 	}
 no_run:
 	{
 		uint32_t t = 0, i, fnd = 0;
 
 		if (hpts->p_on_queue_cnt) {
 
 
 			/*
 			 * Find next slot that is occupied and use that to
 			 * be the sleep time.
 			 */
 			for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) {
 				if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
 					fnd = 1;
 					break;
 				}
 				t = (t + 1) % NUM_OF_HPTSI_SLOTS;
 			}
 			if (fnd) {
 				hpts->p_hpts_sleep_time = i;
 			} else {
 				counter_u64_add(back_tosleep, 1);
 #ifdef INVARIANTS
 				panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt);
 #endif
 				hpts->p_on_queue_cnt = 0;
 				goto non_found;
 			}
 			t++;
 		} else {
 			/* No one on the wheel sleep for all but 2 slots  */
 non_found:
 			if (hpts_sleep_max == 0)
 				hpts_sleep_max = 1;
 			hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max);
 			t = 0;
 		}
 		if (logging_on) {
 			tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC));
 		}
 	}
 }
 
 void
 __tcp_set_hpts(struct inpcb *inp, int32_t line)
 {
 	struct tcp_hpts_entry *hpts;
 
 	INP_WLOCK_ASSERT(inp);
 	hpts = tcp_hpts_lock(inp);
 	if ((inp->inp_in_hpts == 0) &&
 	    (inp->inp_hpts_cpu_set == 0)) {
 		inp->inp_hpts_cpu = hpts_cpuid(inp);
 		inp->inp_hpts_cpu_set = 1;
 	}
 	mtx_unlock(&hpts->p_mtx);
 	hpts = tcp_input_lock(inp);
 	if ((inp->inp_input_cpu_set == 0) &&
 	    (inp->inp_in_input == 0)) {
 		inp->inp_input_cpu = hpts_cpuid(inp);
 		inp->inp_input_cpu_set = 1;
 	}
 	mtx_unlock(&hpts->p_mtx);
 }
 
 uint16_t
 tcp_hpts_delayedby(struct inpcb *inp){
 	return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
 }
 
 static void
 tcp_hpts_thread(void *ctx)
 {
 	struct tcp_hpts_entry *hpts;
 	struct timeval tv;
 	sbintime_t sb;
 
 	hpts = (struct tcp_hpts_entry *)ctx;
 	mtx_lock(&hpts->p_mtx);
 	if (hpts->p_direct_wake) {
 		/* Signaled by input */
 		if (logging_on)
 			tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1);
 		callout_stop(&hpts->co);
 	} else {
 		/* Timed out */
 		if (callout_pending(&hpts->co) ||
 		    !callout_active(&hpts->co)) {
 			if (logging_on)
 				tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2);
 			mtx_unlock(&hpts->p_mtx);
 			return;
 		}
 		callout_deactivate(&hpts->co);
 		if (logging_on)
 			tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3);
 	}
 	hpts->p_hpts_active = 1;
 	(void)tcp_gethptstick(&tv);
 	tcp_hptsi(hpts, &tv);
 	HPTS_MTX_ASSERT(hpts);
 	tv.tv_sec = 0;
 	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
 	if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
 		tv.tv_usec = tcp_min_hptsi_time;
 		hpts->p_on_min_sleep = 1;
 	} else {
 		/* Clear the min sleep flag */
 		hpts->p_on_min_sleep = 0;
 	}
 	hpts->p_hpts_active = 0;
 	sb = tvtosbt(tv);
 	if (tcp_hpts_callout_skip_swi == 0) {
 		callout_reset_sbt_on(&hpts->co, sb, 0,
 		    hpts_timeout_swi, hpts, hpts->p_cpu,
 		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 	} else {
 		callout_reset_sbt_on(&hpts->co, sb, 0,
 		    hpts_timeout_dir, hpts,
 		    hpts->p_cpu,
 		    C_PREL(tcp_hpts_precision));
 	}
 	hpts->p_direct_wake = 0;
 	mtx_unlock(&hpts->p_mtx);
 }
 
 #undef	timersub
 
 static void
 tcp_init_hptsi(void *st)
 {
 	int32_t i, j, error, bound = 0, created = 0;
 	size_t sz, asz;
 	struct timeval tv;
 	sbintime_t sb;
 	struct tcp_hpts_entry *hpts;
 	char unit[16];
 	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
 
 	tcp_pace.rp_proc = NULL;
 	tcp_pace.rp_num_hptss = ncpus;
 	hpts_loops = counter_u64_alloc(M_WAITOK);
 	back_tosleep = counter_u64_alloc(M_WAITOK);
 
 	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
 	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
 	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
 	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
 		    M_TCPHPTS, M_WAITOK | M_ZERO);
 		tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
 		    M_TCPHPTS, M_WAITOK);
 		hpts = tcp_pace.rp_ent[i];
 		/*
 		 * Init all the hpts structures that are not specifically
 		 * zero'd by the allocations. Also lets attach them to the
 		 * appropriate sysctl block as well.
 		 */
 		mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
 		    "hpts", MTX_DEF | MTX_DUPOK);
 		TAILQ_INIT(&hpts->p_input);
 		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
 			TAILQ_INIT(&hpts->p_hptss[j]);
 		}
 		sysctl_ctx_init(&hpts->hpts_ctx);
 		sprintf(unit, "%d", i);
 		hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
 		    OID_AUTO,
 		    unit,
 		    CTLFLAG_RW, 0,
 		    "");
 		SYSCTL_ADD_INT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "in_qcnt", CTLFLAG_RD,
 		    &hpts->p_on_inqueue_cnt, 0,
 		    "Count TCB's awaiting input processing");
 		SYSCTL_ADD_INT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "out_qcnt", CTLFLAG_RD,
 		    &hpts->p_on_queue_cnt, 0,
 		    "Count TCB's awaiting output processing");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "active", CTLFLAG_RD,
 		    &hpts->p_hpts_active, 0,
 		    "Is the hpts active");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "curslot", CTLFLAG_RD,
 		    &hpts->p_cur_slot, 0,
 		    "What the current slot is if active");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "curtick", CTLFLAG_RD,
 		    &hpts->p_curtick, 0,
 		    "What the current tick on if active");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "logsize", CTLFLAG_RD,
 		    &hpts->p_logsize, 0,
 		    "Hpts logging buffer size");
 		hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2;
 		hpts->p_num = i;
 		hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv);
 		hpts->p_prevtick -= 1;
 		hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS;
 		hpts->p_cpu = 0xffff;
 		hpts->p_nxt_slot = 1;
 		hpts->p_logsize = tcp_hpts_logging_size;
 		if (hpts->p_logsize) {
 			sz = (sizeof(struct hpts_log) * hpts->p_logsize);
 			hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
 		}
 		callout_init(&hpts->co, 1);
 	}
 	/*
 	 * Now lets start ithreads to handle the hptss.
 	 */
 	CPU_FOREACH(i) {
 		hpts = tcp_pace.rp_ent[i];
 		hpts->p_cpu = i;
 		error = swi_add(&hpts->ie, "hpts",
 		    tcp_hpts_thread, (void *)hpts,
 		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
 		if (error) {
 			panic("Can't add hpts:%p i:%d err:%d",
 			    hpts, i, error);
 		}
 		created++;
 		if (tcp_bind_threads) {
 			if (intr_event_bind(hpts->ie, i) == 0)
 				bound++;
 		}
 		tv.tv_sec = 0;
 		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
 		sb = tvtosbt(tv);
 		if (tcp_hpts_callout_skip_swi == 0) {
 			callout_reset_sbt_on(&hpts->co, sb, 0,
 			    hpts_timeout_swi, hpts, hpts->p_cpu,
 			    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 		} else {
 			callout_reset_sbt_on(&hpts->co, sb, 0,
 			    hpts_timeout_dir, hpts,
 			    hpts->p_cpu,
 			    C_PREL(tcp_hpts_precision));
 		}
 	}
 	printf("TCP Hpts created %d swi interrupt thread and bound %d\n",
 	    created, bound);
 	return;
 }
 
 SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
 MODULE_VERSION(tcphpts, 1);
Index: head/sys/netinet/tcp_input.c
===================================================================
--- head/sys/netinet/tcp_input.c	(revision 335355)
+++ head/sys/netinet/tcp_input.c	(revision 335356)
@@ -1,3895 +1,3865 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 2007-2008,2010
  *	Swinburne University of Technology, Melbourne, Australia.
  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010 The FreeBSD Foundation
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by Lawrence Stewart,
  * James Healy and David Hayes, made possible in part by a grant from the Cisco
  * University Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced
  * Internet Architectures, Swinburne University of Technology, Melbourne,
  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
 #endif
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_fastopen.h>
 #ifdef TCPPCAP
 #include <netinet/tcp_pcap.h>
 #endif
 #include <netinet/tcp_syncache.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif /* TCPDEBUG */
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 const int tcprexmtthresh = 3;
 
 int tcp_log_in_vain = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
     &tcp_log_in_vain, 0,
     "Log all incoming TCP segments to closed ports");
 
 VNET_DEFINE(int, blackhole) = 0;
 #define	V_blackhole		VNET(blackhole)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(blackhole), 0,
     "Do not send RST on segments to closed ports");
 
 VNET_DEFINE(int, tcp_delack_enabled) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_delack_enabled), 0,
     "Delay ACK to try and piggyback it onto a data packet");
 
 VNET_DEFINE(int, drop_synfin) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(drop_synfin), 0,
     "Drop TCP packets with SYN+FIN set");
 
 VNET_DEFINE(int, tcp_do_rfc6675_pipe) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc6675_pipe, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc6675_pipe), 0,
     "Use calculated pipe/in-flight bytes per RFC 6675");
 
 VNET_DEFINE(int, tcp_do_rfc3042) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3042), 0,
     "Enable RFC 3042 (Limited Transmit)");
 
 VNET_DEFINE(int, tcp_do_rfc3390) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3390), 0,
     "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
 
 VNET_DEFINE(int, tcp_initcwnd_segments) = 10;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0,
     "Slow-start flight size (initial congestion window) in number of segments");
 
 VNET_DEFINE(int, tcp_do_rfc3465) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3465), 0,
     "Enable RFC 3465 (Appropriate Byte Counting)");
 
 VNET_DEFINE(int, tcp_abc_l_var) = 2;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_abc_l_var), 2,
     "Cap the max cwnd increment during slow-start to this number of segments");
 
 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
 
 VNET_DEFINE(int, tcp_do_ecn) = 2;
 SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_ecn), 0,
     "TCP ECN support");
 
 VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
 SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_ecn_maxretries), 0,
     "Max retries before giving up on ECN");
 
 VNET_DEFINE(int, tcp_insecure_syn) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_insecure_syn), 0,
     "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets");
 
 VNET_DEFINE(int, tcp_insecure_rst) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_insecure_rst), 0,
     "Follow RFC793 instead of RFC5961 criteria for accepting RST packets");
 
 VNET_DEFINE(int, tcp_recvspace) = 1024*64;
 #define	V_tcp_recvspace	VNET(tcp_recvspace)
 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size");
 
 VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_autorcvbuf), 0,
     "Enable automatic receive buffer sizing");
 
 VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_autorcvbuf_inc), 0,
     "Incrementor step size of automatic receive buffer");
 
 VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_autorcvbuf_max), 0,
     "Max size of automatic receive buffer");
 
 VNET_DEFINE(struct inpcbhead, tcb);
 #define	tcb6	tcb  /* for KAME src sync over BSD*'s */
 VNET_DEFINE(struct inpcbinfo, tcbinfo);
 
 /*
  * TCP statistics are stored in an array of counter(9)s, which size matches
  * size of struct tcpstat.  TCP running connection count is a regular array.
  */
 VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat,
     tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
 VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]);
 SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD |
     CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES,
     "TCP connection counts by TCP state");
 
 static void
 tcp_vnet_init(const void *unused)
 {
 
 	COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK);
 	VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK);
 }
 VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     tcp_vnet_init, NULL);
 
 #ifdef VIMAGE
 static void
 tcp_vnet_uninit(const void *unused)
 {
 
 	COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES);
 	VNET_PCPUSTAT_FREE(tcpstat);
 }
 VNET_SYSUNINIT(tcp_vnet_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     tcp_vnet_uninit, NULL);
 #endif /* VIMAGE */
 
 /*
  * Kernel module interface for updating tcpstat.  The argument is an index
  * into tcpstat treated as an array.
  */
 void
 kmod_tcpstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(tcpstat)[statnum], 1);
 }
 
 #ifdef TCP_HHOOK
 /*
  * Wrapper for the TCP established input helper hook.
  */
 void
 hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
 {
 	struct tcp_hhook_data hhook_data;
 
 	if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) {
 		hhook_data.tp = tp;
 		hhook_data.th = th;
 		hhook_data.to = to;
 
 		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data,
 		    tp->osd);
 	}
 }
 #endif
 
 /*
  * CC wrapper hook functions
  */
 void
 cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs,
     uint16_t type)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tp->ccv->nsegs = nsegs;
 	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
 	if (tp->snd_cwnd <= tp->snd_wnd)
 		tp->ccv->flags |= CCF_CWND_LIMITED;
 	else
 		tp->ccv->flags &= ~CCF_CWND_LIMITED;
 
 	if (type == CC_ACK) {
 		if (tp->snd_cwnd > tp->snd_ssthresh) {
 			tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
 			     nsegs * V_tcp_abc_l_var * tcp_maxseg(tp));
 			if (tp->t_bytes_acked >= tp->snd_cwnd) {
 				tp->t_bytes_acked -= tp->snd_cwnd;
 				tp->ccv->flags |= CCF_ABC_SENTAWND;
 			}
 		} else {
 				tp->ccv->flags &= ~CCF_ABC_SENTAWND;
 				tp->t_bytes_acked = 0;
 		}
 	}
 
 	if (CC_ALGO(tp)->ack_received != NULL) {
 		/* XXXLAS: Find a way to live without this */
 		tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->ack_received(tp->ccv, type);
 	}
 }
 
 void 
 cc_conn_init(struct tcpcb *tp)
 {
 	struct hc_metrics_lite metrics;
 	struct inpcb *inp = tp->t_inpcb;
 	u_int maxseg;
 	int rtt;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tcp_hc_get(&inp->inp_inc, &metrics);
 	maxseg = tcp_maxseg(tp);
 
 	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
 		tp->t_srtt = rtt;
 		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
 		TCPSTAT_INC(tcps_usedrtt);
 		if (metrics.rmx_rttvar) {
 			tp->t_rttvar = metrics.rmx_rttvar;
 			TCPSTAT_INC(tcps_usedrttvar);
 		} else {
 			/* default variation is +- 1 rtt */
 			tp->t_rttvar =
 			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
 		}
 		TCPT_RANGESET(tp->t_rxtcur,
 		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
 		    tp->t_rttmin, TCPTV_REXMTMAX);
 	}
 	if (metrics.rmx_ssthresh) {
 		/*
 		 * There's some sort of gateway or interface
 		 * buffer limit on the path.  Use this to set
 		 * the slow start threshold, but set the
 		 * threshold to no less than 2*mss.
 		 */
 		tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh);
 		TCPSTAT_INC(tcps_usedssthresh);
 	}
 
 	/*
 	 * Set the initial slow-start flight size.
 	 *
 	 * RFC5681 Section 3.1 specifies the default conservative values.
 	 * RFC3390 specifies slightly more aggressive values.
 	 * RFC6928 increases it to ten segments.
 	 * Support for user specified value for initial flight size.
 	 *
 	 * If a SYN or SYN/ACK was lost and retransmitted, we have to
 	 * reduce the initial CWND to one segment as congestion is likely
 	 * requiring us to be cautious.
 	 */
 	if (tp->snd_cwnd == 1)
 		tp->snd_cwnd = maxseg;		/* SYN(-ACK) lost */
 	else if (V_tcp_initcwnd_segments)
 		tp->snd_cwnd = min(V_tcp_initcwnd_segments * maxseg,
 		    max(2 * maxseg, V_tcp_initcwnd_segments * 1460));
 	else if (V_tcp_do_rfc3390)
 		tp->snd_cwnd = min(4 * maxseg, max(2 * maxseg, 4380));
 	else {
 		/* Per RFC5681 Section 3.1 */
 		if (maxseg > 2190)
 			tp->snd_cwnd = 2 * maxseg;
 		else if (maxseg > 1095)
 			tp->snd_cwnd = 3 * maxseg;
 		else
 			tp->snd_cwnd = 4 * maxseg;
 	}
 
 	if (CC_ALGO(tp)->conn_init != NULL)
 		CC_ALGO(tp)->conn_init(tp->ccv);
 }
 
 void inline
 cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
 {
 	u_int maxseg;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	switch(type) {
 	case CC_NDUPACK:
 		if (!IN_FASTRECOVERY(tp->t_flags)) {
 			tp->snd_recover = tp->snd_max;
 			if (tp->t_flags & TF_ECN_PERMIT)
 				tp->t_flags |= TF_ECN_SND_CWR;
 		}
 		break;
 	case CC_ECN:
 		if (!IN_CONGRECOVERY(tp->t_flags)) {
 			TCPSTAT_INC(tcps_ecn_rcwnd);
 			tp->snd_recover = tp->snd_max;
 			if (tp->t_flags & TF_ECN_PERMIT)
 				tp->t_flags |= TF_ECN_SND_CWR;
 		}
 		break;
 	case CC_RTO:
 		maxseg = tcp_maxseg(tp);
 		tp->t_dupacks = 0;
 		tp->t_bytes_acked = 0;
 		EXIT_RECOVERY(tp->t_flags);
 		tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
 		    maxseg) * maxseg;
 		tp->snd_cwnd = maxseg;
 		break;
 	case CC_RTO_ERR:
 		TCPSTAT_INC(tcps_sndrexmitbad);
 		/* RTO was unnecessary, so reset everything. */
 		tp->snd_cwnd = tp->snd_cwnd_prev;
 		tp->snd_ssthresh = tp->snd_ssthresh_prev;
 		tp->snd_recover = tp->snd_recover_prev;
 		if (tp->t_flags & TF_WASFRECOVERY)
 			ENTER_FASTRECOVERY(tp->t_flags);
 		if (tp->t_flags & TF_WASCRECOVERY)
 			ENTER_CONGRECOVERY(tp->t_flags);
 		tp->snd_nxt = tp->snd_max;
 		tp->t_flags &= ~TF_PREVVALID;
 		tp->t_badrxtwin = 0;
 		break;
 	}
 
 	if (CC_ALGO(tp)->cong_signal != NULL) {
 		if (th != NULL)
 			tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->cong_signal(tp->ccv, type);
 	}
 }
 
 void inline
 cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/* XXXLAS: KASSERT that we're in recovery? */
 
 	if (CC_ALGO(tp)->post_recovery != NULL) {
 		tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->post_recovery(tp->ccv);
 	}
 	/* XXXLAS: EXIT_RECOVERY ? */
 	tp->t_bytes_acked = 0;
 }
 
 /*
  * Indicate whether this ack should be delayed.  We can delay the ack if
  * following conditions are met:
  *	- There is no delayed ack timer in progress.
  *	- Our last ack wasn't a 0-sized window. We never want to delay
  *	  the ack that opens up a 0-sized window.
  *	- LRO wasn't used for this segment. We make sure by checking that the
  *	  segment size is not larger than the MSS.
  */
 #define DELAY_ACK(tp, tlen)						\
 	((!tcp_timer_active(tp, TT_DELACK) &&				\
 	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
 	    (tlen <= tp->t_maxseg) &&					\
 	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
 
 static void inline
 cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (CC_ALGO(tp)->ecnpkt_handler != NULL) {
 		switch (iptos & IPTOS_ECN_MASK) {
 		case IPTOS_ECN_CE:
 		    tp->ccv->flags |= CCF_IPHDR_CE;
 		    break;
 		case IPTOS_ECN_ECT0:
 		    tp->ccv->flags &= ~CCF_IPHDR_CE;
 		    break;
 		case IPTOS_ECN_ECT1:
 		    tp->ccv->flags &= ~CCF_IPHDR_CE;
 		    break;
 		}
 
 		if (th->th_flags & TH_CWR)
 			tp->ccv->flags |= CCF_TCPHDR_CWR;
 		else
 			tp->ccv->flags &= ~CCF_TCPHDR_CWR;
 
 		if (tp->t_flags & TF_DELACK)
 			tp->ccv->flags |= CCF_DELACK;
 		else
 			tp->ccv->flags &= ~CCF_DELACK;
 
 		CC_ALGO(tp)->ecnpkt_handler(tp->ccv);
 
 		if (tp->ccv->flags & CCF_ACKNOW)
 			tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
 	}
 }
 
 /*
  * TCP input handling is split into multiple parts:
  *   tcp6_input is a thin wrapper around tcp_input for the extended
  *	ip6_protox[] call format in ip6_input
  *   tcp_input handles primary segment validation, inpcb lookup and
  *	SYN processing on listen sockets
  *   tcp_do_segment processes the ACK and text of the segment for
  *	establishing, established and closing connections
  */
 #ifdef INET6
 int
 tcp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct in6_ifaddr *ia6;
 	struct ip6_hdr *ip6;
 
 	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
 
 	/*
 	 * draft-itojun-ipv6-tcp-to-anycast
 	 * better place to put this in?
 	 */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
 	if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
 		struct ip6_hdr *ip6;
 
 		ifa_free(&ia6->ia_ifa);
 		ip6 = mtod(m, struct ip6_hdr *);
 		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
 			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
 		return (IPPROTO_DONE);
 	}
 	if (ia6)
 		ifa_free(&ia6->ia_ifa);
 
 	return (tcp_input(mp, offp, proto));
 }
 #endif /* INET6 */
 
 int
 tcp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct tcphdr *th = NULL;
 	struct ip *ip = NULL;
 	struct inpcb *inp = NULL;
 	struct tcpcb *tp = NULL;
 	struct socket *so = NULL;
 	u_char *optp = NULL;
 	int off0;
 	int optlen = 0;
 #ifdef INET
 	int len;
 #endif
 	int tlen = 0, off;
 	int drop_hdrlen;
 	int thflags;
 	int rstreason = 0;	/* For badport_bandlim accounting purposes */
 	uint8_t iptos;
 	struct m_tag *fwd_tag = NULL;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
 #else
 	const void *ip6 = NULL;
 #endif /* INET6 */
 	struct tcpopt to;		/* options in this segment */
 	char *s = NULL;			/* address and port logging */
 	int ti_locked;
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 #endif
 
 #ifdef INET6
 	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
 #endif
 
 	off0 = *offp;
 	m = *mp;
 	*mp = NULL;
 	to.to_flags = 0;
 	TCPSTAT_INC(tcps_rcvtotal);
 
 #ifdef INET6
 	if (isipv6) {
 		/* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */
 
 		if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
 			m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
 			if (m == NULL) {
 				TCPSTAT_INC(tcps_rcvshort);
 				return (IPPROTO_DONE);
 			}
 		}
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)((caddr_t)ip6 + off0);
 		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				th->th_sum = m->m_pkthdr.csum_data;
 			else
 				th->th_sum = in6_cksum_pseudo(ip6, tlen,
 				    IPPROTO_TCP, m->m_pkthdr.csum_data);
 			th->th_sum ^= 0xffff;
 		} else
 			th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen);
 		if (th->th_sum) {
 			TCPSTAT_INC(tcps_rcvbadsum);
 			goto drop;
 		}
 
 		/*
 		 * Be proactive about unspecified IPv6 address in source.
 		 * As we use all-zero to indicate unbounded/unconnected pcb,
 		 * unspecified IPv6 address can be used to confuse us.
 		 *
 		 * Note that packets with unspecified IPv6 destination is
 		 * already dropped in ip6_input.
 		 */
 		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
 			/* XXX stat */
 			goto drop;
 		}
 		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		/*
 		 * Get IP and TCP header together in first mbuf.
 		 * Note: IP leaves IP header in first mbuf.
 		 */
 		if (off0 > sizeof (struct ip)) {
 			ip_stripoptions(m);
 			off0 = sizeof(struct ip);
 		}
 		if (m->m_len < sizeof (struct tcpiphdr)) {
 			if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
 			    == NULL) {
 				TCPSTAT_INC(tcps_rcvshort);
 				return (IPPROTO_DONE);
 			}
 		}
 		ip = mtod(m, struct ip *);
 		th = (struct tcphdr *)((caddr_t)ip + off0);
 		tlen = ntohs(ip->ip_len) - off0;
 
 		iptos = ip->ip_tos;
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				th->th_sum = m->m_pkthdr.csum_data;
 			else
 				th->th_sum = in_pseudo(ip->ip_src.s_addr,
 				    ip->ip_dst.s_addr,
 				    htonl(m->m_pkthdr.csum_data + tlen +
 				    IPPROTO_TCP));
 			th->th_sum ^= 0xffff;
 		} else {
 			struct ipovly *ipov = (struct ipovly *)ip;
 
 			/*
 			 * Checksum extended TCP header and data.
 			 */
 			len = off0 + tlen;
 			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
 			ipov->ih_len = htons(tlen);
 			th->th_sum = in_cksum(m, len);
 			/* Reset length for SDT probes. */
 			ip->ip_len = htons(len);
 			/* Reset TOS bits */
 			ip->ip_tos = iptos;
 			/* Re-initialization for later version check */
 			ip->ip_v = IPVERSION;
 			ip->ip_hl = off0 >> 2;
 		}
 
 		if (th->th_sum) {
 			TCPSTAT_INC(tcps_rcvbadsum);
 			goto drop;
 		}
 	}
 #endif /* INET */
 
 	/*
 	 * Check that TCP offset makes sense,
 	 * pull out TCP options and adjust length.		XXX
 	 */
 	off = th->th_off << 2;
 	if (off < sizeof (struct tcphdr) || off > tlen) {
 		TCPSTAT_INC(tcps_rcvbadoff);
 		goto drop;
 	}
 	tlen -= off;	/* tlen is used instead of ti->ti_len */
 	if (off > sizeof (struct tcphdr)) {
 #ifdef INET6
 		if (isipv6) {
 			IP6_EXTHDR_CHECK(m, off0, off, IPPROTO_DONE);
 			ip6 = mtod(m, struct ip6_hdr *);
 			th = (struct tcphdr *)((caddr_t)ip6 + off0);
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			if (m->m_len < sizeof(struct ip) + off) {
 				if ((m = m_pullup(m, sizeof (struct ip) + off))
 				    == NULL) {
 					TCPSTAT_INC(tcps_rcvshort);
 					return (IPPROTO_DONE);
 				}
 				ip = mtod(m, struct ip *);
 				th = (struct tcphdr *)((caddr_t)ip + off0);
 			}
 		}
 #endif
 		optlen = off - sizeof (struct tcphdr);
 		optp = (u_char *)(th + 1);
 	}
 	thflags = th->th_flags;
 
 	/*
 	 * Convert TCP protocol specific fields to host format.
 	 */
 	tcp_fields_to_host(th);
 
 	/*
 	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
 	 */
 	drop_hdrlen = off0 + off;
 
 	/*
 	 * Locate pcb for segment; if we're likely to add or remove a
 	 * connection then first acquire pcbinfo lock.  There are three cases
 	 * where we might discover later we need a write lock despite the
 	 * flags: ACKs moving a connection out of the syncache, ACKs for a
 	 * connection in TIMEWAIT and SYNs not targeting a listening socket.
 	 */
 	if ((thflags & (TH_FIN | TH_RST)) != 0) {
 		INP_INFO_RLOCK(&V_tcbinfo);
 		ti_locked = TI_RLOCKED;
 	} else
 		ti_locked = TI_UNLOCKED;
 
 	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
         if (
 #ifdef INET6
 	    (isipv6 && (m->m_flags & M_IP6_NEXTHOP))
 #ifdef INET
 	    || (!isipv6 && (m->m_flags & M_IP_NEXTHOP))
 #endif
 #endif
 #if defined(INET) && !defined(INET6)
 	    (m->m_flags & M_IP_NEXTHOP)
 #endif
 	    )
 		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
 
 findpcb:
 #ifdef INVARIANTS
 	if (ti_locked == TI_RLOCKED) {
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	} else {
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	}
 #endif
 #ifdef INET6
 	if (isipv6 && fwd_tag != NULL) {
 		struct sockaddr_in6 *next_hop6;
 
 		next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * Already got one like this?
 		 */
 		inp = in6_pcblookup_mbuf(&V_tcbinfo,
 		    &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport,
 		    INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src,
 			    th->th_sport, &next_hop6->sin6_addr,
 			    next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) :
 			    th->th_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
 		}
 	} else if (isipv6) {
 		inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src,
 		    th->th_sport, &ip6->ip6_dst, th->th_dport,
 		    INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
 		    m->m_pkthdr.rcvif, m);
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	if (fwd_tag != NULL) {
 		struct sockaddr_in *next_hop;
 
 		next_hop = (struct sockaddr_in *)(fwd_tag+1);
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * already got one like this?
 		 */
 		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport,
 		    ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB,
 		    m->m_pkthdr.rcvif, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in_pcblookup(&V_tcbinfo, ip->ip_src,
 			    th->th_sport, next_hop->sin_addr,
 			    next_hop->sin_port ? ntohs(next_hop->sin_port) :
 			    th->th_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
 		}
 	} else
 		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
 		    th->th_sport, ip->ip_dst, th->th_dport,
 		    INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
 		    m->m_pkthdr.rcvif, m);
 #endif /* INET */
 
 	/*
 	 * If the INPCB does not exist then all data in the incoming
 	 * segment is discarded and an appropriate RST is sent back.
 	 * XXX MRT Send RST using which routing table?
 	 */
 	if (inp == NULL) {
 		/*
 		 * Log communication attempts to ports that are not
 		 * in use.
 		 */
 		if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
 		    tcp_log_in_vain == 2) {
 			if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6)))
 				log(LOG_INFO, "%s; %s: Connection attempt "
 				    "to closed port\n", s, __func__);
 		}
 		/*
 		 * When blackholing do not respond with a RST but
 		 * completely ignore the segment and drop it.
 		 */
 		if ((V_blackhole == 1 && (thflags & TH_SYN)) ||
 		    V_blackhole == 2)
 			goto dropunlock;
 
 		rstreason = BANDLIM_RST_CLOSEDPORT;
 		goto dropwithreset;
 	}
 	INP_WLOCK_ASSERT(inp);
 	/*
 	 * While waiting for inp lock during the lookup, another thread
 	 * can have dropped the inpcb, in which case we need to loop back
 	 * and try to find a new inpcb to deliver to.
 	 */
 	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		inp = NULL;
 		goto findpcb;
 	}
 	if ((inp->inp_flowtype == M_HASHTYPE_NONE) &&
 	    (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) &&
 	    ((inp->inp_socket == NULL) ||
 	    (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) {
 		inp->inp_flowid = m->m_pkthdr.flowid;
 		inp->inp_flowtype = M_HASHTYPE_GET(m);
 	}
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 #ifdef INET6
 	if (isipv6 && IPSEC_ENABLED(ipv6) &&
 	    IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) {
 		goto dropunlock;
 	}
 #ifdef INET
 	else
 #endif
 #endif /* INET6 */
 #ifdef INET
 	if (IPSEC_ENABLED(ipv4) &&
 	    IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) {
 		goto dropunlock;
 	}
 #endif /* INET */
 #endif /* IPSEC */
 
 	/*
 	 * Check the minimum TTL for socket.
 	 */
 	if (inp->inp_ip_minttl != 0) {
 #ifdef INET6
 		if (isipv6) {
 			if (inp->inp_ip_minttl > ip6->ip6_hlim)
 				goto dropunlock;
 		} else
 #endif
 		if (inp->inp_ip_minttl > ip->ip_ttl)
 			goto dropunlock;
 	}
 
 	/*
 	 * A previous connection in TIMEWAIT state is supposed to catch stray
 	 * or duplicate segments arriving late.  If this segment was a
 	 * legitimate new connection attempt, the old INPCB gets removed and
 	 * we can try again to find a listening socket.
 	 *
 	 * At this point, due to earlier optimism, we may hold only an inpcb
 	 * lock, and not the inpcbinfo write lock.  If so, we need to try to
 	 * acquire it, or if that fails, acquire a reference on the inpcb,
 	 * drop all locks, acquire a global write lock, and then re-acquire
 	 * the inpcb lock.  We may at that point discover that another thread
 	 * has tried to free the inpcb, in which case we need to loop back
 	 * and try to find a new inpcb to deliver to.
 	 *
 	 * XXXRW: It may be time to rethink timewait locking.
 	 */
-relocked:
 	if (inp->inp_flags & INP_TIMEWAIT) {
 		if (ti_locked == TI_UNLOCKED) {
-			if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
-				in_pcbref(inp);
-				INP_WUNLOCK(inp);
-				INP_INFO_RLOCK(&V_tcbinfo);
-				ti_locked = TI_RLOCKED;
-				INP_WLOCK(inp);
-				if (in_pcbrele_wlocked(inp)) {
-					inp = NULL;
-					goto findpcb;
-				} else if (inp->inp_flags & INP_DROPPED) {
-					INP_WUNLOCK(inp);
-					inp = NULL;
-					goto findpcb;
-				}
-			} else
-				ti_locked = TI_RLOCKED;
+			INP_INFO_RLOCK();
+			ti_locked = TI_RLOCKED;
 		}
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 
 		if (thflags & TH_SYN)
 			tcp_dooptions(&to, optp, optlen, TO_SYN);
 		/*
 		 * NB: tcp_twcheck unlocks the INP and frees the mbuf.
 		 */
 		if (tcp_twcheck(inp, &to, th, m, tlen))
 			goto findpcb;
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		return (IPPROTO_DONE);
 	}
 	/*
 	 * The TCPCB may no longer exist if the connection is winding
 	 * down or it is in the CLOSED state.  Either way we drop the
 	 * segment and send an appropriate response.
 	 */
 	tp = intotcpcb(inp);
 	if (tp == NULL || tp->t_state == TCPS_CLOSED) {
 		rstreason = BANDLIM_RST_CLOSEDPORT;
 		goto dropwithreset;
 	}
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE) {
 		tcp_offload_input(tp, m);
 		m = NULL;	/* consumed by the TOE driver */
 		goto dropunlock;
 	}
 #endif
 
 	/*
 	 * We've identified a valid inpcb, but it could be that we need an
 	 * inpcbinfo write lock but don't hold it.  In this case, attempt to
 	 * acquire using the same strategy as the TIMEWAIT case above.  If we
 	 * relock, we have to jump back to 'relocked' as the connection might
 	 * now be in TIMEWAIT.
 	 */
 #ifdef INVARIANTS
 	if ((thflags & (TH_FIN | TH_RST)) != 0)
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 #endif
 	if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) ||
 	      (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) &&
 	       !IS_FASTOPEN(tp->t_flags)))) {
 		if (ti_locked == TI_UNLOCKED) {
-			if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
-				in_pcbref(inp);
-				INP_WUNLOCK(inp);
-				INP_INFO_RLOCK(&V_tcbinfo);
-				ti_locked = TI_RLOCKED;
-				INP_WLOCK(inp);
-				if (in_pcbrele_wlocked(inp)) {
-					inp = NULL;
-					goto findpcb;
-				} else if (inp->inp_flags & INP_DROPPED) {
-					INP_WUNLOCK(inp);
-					inp = NULL;
-					goto findpcb;
-				}
-				goto relocked;
-			} else
-				ti_locked = TI_RLOCKED;
+			INP_INFO_RLOCK();
+			ti_locked = TI_RLOCKED;
 		}
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	}
 
 #ifdef MAC
 	INP_WLOCK_ASSERT(inp);
 	if (mac_inpcb_check_deliver(inp, m))
 		goto dropunlock;
 #endif
 	so = inp->inp_socket;
 	KASSERT(so != NULL, ("%s: so == NULL", __func__));
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG) {
 		ostate = tp->t_state;
 #ifdef INET6
 		if (isipv6) {
 			bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6));
 		} else
 #endif
 			bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
 		tcp_savetcp = *th;
 	}
 #endif /* TCPDEBUG */
 	/*
 	 * When the socket is accepting connections (the INPCB is in LISTEN
 	 * state) we look into the SYN cache if this is a new connection
 	 * attempt or the completion of a previous one.
 	 */
 	KASSERT(tp->t_state == TCPS_LISTEN || !(so->so_options & SO_ACCEPTCONN),
 	    ("%s: so accepting but tp %p not listening", __func__, tp));
 	if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN)) {
 		struct in_conninfo inc;
 
 		bzero(&inc, sizeof(inc));
 #ifdef INET6
 		if (isipv6) {
 			inc.inc_flags |= INC_ISIPV6;
 			inc.inc6_faddr = ip6->ip6_src;
 			inc.inc6_laddr = ip6->ip6_dst;
 		} else
 #endif
 		{
 			inc.inc_faddr = ip->ip_src;
 			inc.inc_laddr = ip->ip_dst;
 		}
 		inc.inc_fport = th->th_sport;
 		inc.inc_lport = th->th_dport;
 		inc.inc_fibnum = so->so_fibnum;
 
 		/*
 		 * Check for an existing connection attempt in syncache if
 		 * the flag is only ACK.  A successful lookup creates a new
 		 * socket appended to the listen queue in SYN_RECEIVED state.
 		 */
 		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
 
 			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 			/*
 			 * Parse the TCP options here because
 			 * syncookies need access to the reflected
 			 * timestamp.
 			 */
 			tcp_dooptions(&to, optp, optlen, 0);
 			/*
 			 * NB: syncache_expand() doesn't unlock
 			 * inp and tcpinfo locks.
 			 */
 			rstreason = syncache_expand(&inc, &to, th, &so, m);
 			if (rstreason < 0) {
 				/*
 				 * A failing TCP MD5 signature comparison
 				 * must result in the segment being dropped
 				 * and must not produce any response back
 				 * to the sender.
 				 */
 				goto dropunlock;
 			} else if (rstreason == 0) {
 				/*
 				 * No syncache entry or ACK was not
 				 * for our SYN/ACK.  Send a RST.
 				 * NB: syncache did its own logging
 				 * of the failure cause.
 				 */
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 			}
 tfo_socket_result:
 			if (so == NULL) {
 				/*
 				 * We completed the 3-way handshake
 				 * but could not allocate a socket
 				 * either due to memory shortage,
 				 * listen queue length limits or
 				 * global socket limits.  Send RST
 				 * or wait and have the remote end
 				 * retransmit the ACK for another
 				 * try.
 				 */
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 					log(LOG_DEBUG, "%s; %s: Listen socket: "
 					    "Socket allocation failed due to "
 					    "limits or memory shortage, %s\n",
 					    s, __func__,
 					    V_tcp_sc_rst_sock_fail ?
 					    "sending RST" : "try again");
 				if (V_tcp_sc_rst_sock_fail) {
 					rstreason = BANDLIM_UNLIMITED;
 					goto dropwithreset;
 				} else
 					goto dropunlock;
 			}
 			/*
 			 * Socket is created in state SYN_RECEIVED.
 			 * Unlock the listen socket, lock the newly
 			 * created socket and update the tp variable.
 			 */
 			INP_WUNLOCK(inp);	/* listen socket */
 			inp = sotoinpcb(so);
 			/*
 			 * New connection inpcb is already locked by
 			 * syncache_expand().
 			 */
 			INP_WLOCK_ASSERT(inp);
 			tp = intotcpcb(inp);
 			KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
 			    ("%s: ", __func__));
 			/*
 			 * Process the segment and the data it
 			 * contains.  tcp_do_segment() consumes
 			 * the mbuf chain and unlocks the inpcb.
 			 */
 			tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
 			    iptos, ti_locked);
 			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 			return (IPPROTO_DONE);
 		}
 		/*
 		 * Segment flag validation for new connection attempts:
 		 *
 		 * Our (SYN|ACK) response was rejected.
 		 * Check with syncache and remove entry to prevent
 		 * retransmits.
 		 *
 		 * NB: syncache_chkrst does its own logging of failure
 		 * causes.
 		 */
 		if (thflags & TH_RST) {
 			syncache_chkrst(&inc, th);
 			goto dropunlock;
 		}
 		/*
 		 * We can't do anything without SYN.
 		 */
 		if ((thflags & TH_SYN) == 0) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN is missing, segment ignored\n",
 				    s, __func__);
 			TCPSTAT_INC(tcps_badsyn);
 			goto dropunlock;
 		}
 		/*
 		 * (SYN|ACK) is bogus on a listen socket.
 		 */
 		if (thflags & TH_ACK) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN|ACK invalid, segment rejected\n",
 				    s, __func__);
 			syncache_badack(&inc);	/* XXX: Not needed! */
 			TCPSTAT_INC(tcps_badsyn);
 			rstreason = BANDLIM_RST_OPENPORT;
 			goto dropwithreset;
 		}
 		/*
 		 * If the drop_synfin option is enabled, drop all
 		 * segments with both the SYN and FIN bits set.
 		 * This prevents e.g. nmap from identifying the
 		 * TCP/IP stack.
 		 * XXX: Poor reasoning.  nmap has other methods
 		 * and is constantly refining its stack detection
 		 * strategies.
 		 * XXX: This is a violation of the TCP specification
 		 * and was used by RFC1644.
 		 */
 		if ((thflags & TH_FIN) && V_drop_synfin) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN|FIN segment ignored (based on "
 				    "sysctl setting)\n", s, __func__);
 			TCPSTAT_INC(tcps_badsyn);
 			goto dropunlock;
 		}
 		/*
 		 * Segment's flags are (SYN) or (SYN|FIN).
 		 *
 		 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
 		 * as they do not affect the state of the TCP FSM.
 		 * The data pointed to by TH_URG and th_urp is ignored.
 		 */
 		KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
 		    ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
 		KASSERT(thflags & (TH_SYN),
 		    ("%s: Listen socket: TH_SYN not set", __func__));
 #ifdef INET6
 		/*
 		 * If deprecated address is forbidden,
 		 * we do not accept SYN to deprecated interface
 		 * address to prevent any new inbound connection from
 		 * getting established.
 		 * When we do not accept SYN, we send a TCP RST,
 		 * with deprecated source address (instead of dropping
 		 * it).  We compromise it as it is much better for peer
 		 * to send a RST, and RST will be the final packet
 		 * for the exchange.
 		 *
 		 * If we do not forbid deprecated addresses, we accept
 		 * the SYN packet.  RFC2462 does not suggest dropping
 		 * SYN in this case.
 		 * If we decipher RFC2462 5.5.4, it says like this:
 		 * 1. use of deprecated addr with existing
 		 *    communication is okay - "SHOULD continue to be
 		 *    used"
 		 * 2. use of it with new communication:
 		 *   (2a) "SHOULD NOT be used if alternate address
 		 *        with sufficient scope is available"
 		 *   (2b) nothing mentioned otherwise.
 		 * Here we fall into (2b) case as we have no choice in
 		 * our source address selection - we must obey the peer.
 		 *
 		 * The wording in RFC2462 is confusing, and there are
 		 * multiple description text for deprecated address
 		 * handling - worse, they are not exactly the same.
 		 * I believe 5.5.4 is the best one, so we follow 5.5.4.
 		 */
 		if (isipv6 && !V_ip6_use_deprecated) {
 			struct in6_ifaddr *ia6;
 
 			ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
 			if (ia6 != NULL &&
 			    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
 				ifa_free(&ia6->ia_ifa);
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt to deprecated "
 					"IPv6 address rejected\n",
 					s, __func__);
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 			}
 			if (ia6)
 				ifa_free(&ia6->ia_ifa);
 		}
 #endif /* INET6 */
 		/*
 		 * Basic sanity checks on incoming SYN requests:
 		 *   Don't respond if the destination is a link layer
 		 *	broadcast according to RFC1122 4.2.3.10, p. 104.
 		 *   If it is from this socket it must be forged.
 		 *   Don't respond if the source or destination is a
 		 *	global or subnet broad- or multicast address.
 		 *   Note that it is quite possible to receive unicast
 		 *	link-layer packets with a broadcast IP address. Use
 		 *	in_broadcast() to find them.
 		 */
 		if (m->m_flags & (M_BCAST|M_MCAST)) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 			    log(LOG_DEBUG, "%s; %s: Listen socket: "
 				"Connection attempt from broad- or multicast "
 				"link layer address ignored\n", s, __func__);
 			goto dropunlock;
 		}
 #ifdef INET6
 		if (isipv6) {
 			if (th->th_dport == th->th_sport &&
 			    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt to/from self "
 					"ignored\n", s, __func__);
 				goto dropunlock;
 			}
 			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to multicast "
 					"address ignored\n", s, __func__);
 				goto dropunlock;
 			}
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			if (th->th_dport == th->th_sport &&
 			    ip->ip_dst.s_addr == ip->ip_src.s_addr) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to self "
 					"ignored\n", s, __func__);
 				goto dropunlock;
 			}
 			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 			    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 			    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
 			    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to broad- "
 					"or multicast address ignored\n",
 					s, __func__);
 				goto dropunlock;
 			}
 		}
 #endif
 		/*
 		 * SYN appears to be valid.  Create compressed TCP state
 		 * for syncache.
 		 */
 #ifdef TCPDEBUG
 		if (so->so_options & SO_DEBUG)
 			tcp_trace(TA_INPUT, ostate, tp,
 			    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 		TCP_PROBE3(debug__input, tp, th, m);
 		tcp_dooptions(&to, optp, optlen, TO_SYN);
 		if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL))
 			goto tfo_socket_result;
 
 		/*
 		 * Entry added to syncache and mbuf consumed.
 		 * Only the listen socket is unlocked by syncache_add().
 		 */
 		if (ti_locked == TI_RLOCKED) {
 			INP_INFO_RUNLOCK(&V_tcbinfo);
 			ti_locked = TI_UNLOCKED;
 		}
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 		return (IPPROTO_DONE);
 	} else if (tp->t_state == TCPS_LISTEN) {
 		/*
 		 * When a listen socket is torn down the SO_ACCEPTCONN
 		 * flag is removed first while connections are drained
 		 * from the accept queue in a unlock/lock cycle of the
 		 * ACCEPT_LOCK, opening a race condition allowing a SYN
 		 * attempt go through unhandled.
 		 */
 		goto dropunlock;
 	}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if (tp->t_flags & TF_SIGNATURE) {
 		tcp_dooptions(&to, optp, optlen, thflags);
 		if ((to.to_flags & TOF_SIGNATURE) == 0) {
 			TCPSTAT_INC(tcps_sig_err_nosigopt);
 			goto dropunlock;
 		}
 		if (!TCPMD5_ENABLED() ||
 		    TCPMD5_INPUT(m, th, to.to_signature) != 0)
 			goto dropunlock;
 	}
 #endif
 	TCP_PROBE5(receive, NULL, tp, m, tp, th);
 
 	/*
 	 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
 	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
 	 * the inpcb, and unlocks pcbinfo.
 	 */
 	tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	return (IPPROTO_DONE);
 
 dropwithreset:
 	TCP_PROBE5(receive, NULL, tp, m, tp, th);
 
 	if (ti_locked == TI_RLOCKED) {
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		ti_locked = TI_UNLOCKED;
 	}
 #ifdef INVARIANTS
 	else {
 		KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset "
 		    "ti_locked: %d", __func__, ti_locked));
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	}
 #endif
 
 	if (inp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_WUNLOCK(inp);
 	} else
 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 	m = NULL;	/* mbuf chain got consumed. */
 	goto drop;
 
 dropunlock:
 	if (m != NULL)
 		TCP_PROBE5(receive, NULL, tp, m, tp, th);
 
 	if (ti_locked == TI_RLOCKED) {
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		ti_locked = TI_UNLOCKED;
 	}
 #ifdef INVARIANTS
 	else {
 		KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock "
 		    "ti_locked: %d", __func__, ti_locked));
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	}
 #endif
 
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 
 drop:
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	if (s != NULL)
 		free(s, M_TCPLOG);
 	if (m != NULL)
 		m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Automatic sizing of receive socket buffer.  Often the send
  * buffer size is not optimally adjusted to the actual network
  * conditions at hand (delay bandwidth product).  Setting the
  * buffer size too small limits throughput on links with high
  * bandwidth and high delay (eg. trans-continental/oceanic links).
  *
  * On the receive side the socket buffer memory is only rarely
  * used to any significant extent.  This allows us to be much
  * more aggressive in scaling the receive socket buffer.  For
  * the case that the buffer space is actually used to a large
  * extent and we run out of kernel memory we can simply drop
  * the new segments; TCP on the sender will just retransmit it
  * later.  Setting the buffer size too big may only consume too
  * much kernel memory if the application doesn't read() from
  * the socket or packet loss or reordering makes use of the
  * reassembly queue.
  *
  * The criteria to step up the receive buffer one notch are:
  *  1. Application has not set receive buffer size with
  *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
  *  2. the number of bytes received during the time it takes
  *     one timestamp to be reflected back to us (the RTT);
  *  3. received bytes per RTT is within seven eighth of the
  *     current socket buffer size;
  *  4. receive buffer size has not hit maximal automatic size;
  *
  * This algorithm does one step per RTT at most and only if
  * we receive a bulk stream w/o packet losses or reorderings.
  * Shrinking the buffer during idle times is not necessary as
  * it doesn't consume any memory when idle.
  *
  * TODO: Only step up if the application is actually serving
  * the buffer to better manage the socket buffer resources.
  */
 int
 tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int tlen)
 {
 	int newsize = 0;
 
 	if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
 	    tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
 	    TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
 	    (tp->t_srtt >> TCP_RTT_SHIFT)) {
 		if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&
 		    so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
 			newsize = min(so->so_rcv.sb_hiwat +
 			    V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max);
 		}
 		TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);
 
 		/* Start over with next RTT. */
 		tp->rfbuf_ts = 0;
 		tp->rfbuf_cnt = 0;
 	} else {
 		tp->rfbuf_cnt += tlen;	/* add up */
 	}
 
 	return (newsize);
 }
 
 void
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
     int ti_locked)
 {
 	int thflags, acked, ourfinisacked, needoutput = 0, sack_changed;
 	int rstreason, todrop, win;
 	uint32_t tiwin;
 	uint16_t nsegs;
 	char *s;
 	struct in_conninfo *inc;
 	struct mbuf *mfree;
 	struct tcpopt to;
 	int tfo_syn;
 	
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 #endif
 	thflags = th->th_flags;
 	inc = &tp->t_inpcb->inp_inc;
 	tp->sackhint.last_sack_ack = 0;
 	sack_changed = 0;
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 
 	/*
 	 * If this is either a state-changing packet or current state isn't
 	 * established, we require a write lock on tcbinfo.  Otherwise, we
 	 * allow the tcbinfo to be in either alocked or unlocked, as the
 	 * caller may have unnecessarily acquired a write lock due to a race.
 	 */
 	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
 	    tp->t_state != TCPS_ESTABLISHED) {
 		KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
 		    "SYN/FIN/RST/!EST", __func__, ti_locked));
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	} else {
 #ifdef INVARIANTS
 		if (ti_locked == TI_RLOCKED)
 			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		else {
 			KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
 			    "ti_locked: %d", __func__, ti_locked));
 			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 		}
 #endif
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
 	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
 	    __func__));
 
 #ifdef TCPPCAP
 	/* Save segment, if requested. */
 	tcp_pcap_add(th, m, &(tp->t_inpkts));
 #endif
 	TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
 	    tlen, NULL, true);
 
 	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: "
 			    "SYN|FIN segment ignored (based on "
 			    "sysctl setting)\n", s, __func__);
 			free(s, M_TCPLOG);
 		}
 		goto drop;
 	}
 
 	/*
 	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
 	 * check SEQ.ACK first.
 	 */
 	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
 		rstreason = BANDLIM_UNLIMITED;
 		goto dropwithreset;
 	}
 
 	/*
 	 * Segment received on connection.
 	 * Reset idle time and keep-alive timer.
 	 * XXX: This should be done after segment
 	 * validation to ignore broken/spoofed segs.
 	 */
 	tp->t_rcvtime = ticks;
 
 	/*
 	 * Scale up the window into a 32-bit value.
 	 * For the SYN_SENT state the scale is zero.
 	 */
 	tiwin = th->th_win << tp->snd_scale;
 
 	/*
 	 * TCP ECN processing.
 	 */
 	if (tp->t_flags & TF_ECN_PERMIT) {
 		if (thflags & TH_CWR)
 			tp->t_flags &= ~TF_ECN_SND_ECE;
 		switch (iptos & IPTOS_ECN_MASK) {
 		case IPTOS_ECN_CE:
 			tp->t_flags |= TF_ECN_SND_ECE;
 			TCPSTAT_INC(tcps_ecn_ce);
 			break;
 		case IPTOS_ECN_ECT0:
 			TCPSTAT_INC(tcps_ecn_ect0);
 			break;
 		case IPTOS_ECN_ECT1:
 			TCPSTAT_INC(tcps_ecn_ect1);
 			break;
 		}
 
 		/* Process a packet differently from RFC3168. */
 		cc_ecnpkt_handler(tp, th, iptos);
 
 		/* Congestion experienced. */
 		if (thflags & TH_ECE) {
 			cc_cong_signal(tp, th, CC_ECN);
 		}
 	}
 
 	/*
 	 * Parse options on any incoming segment.
 	 */
 	tcp_dooptions(&to, (u_char *)(th + 1),
 	    (th->th_off << 2) - sizeof(struct tcphdr),
 	    (thflags & TH_SYN) ? TO_SYN : 0);
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if ((tp->t_flags & TF_SIGNATURE) != 0 &&
 	    (to.to_flags & TOF_SIGNATURE) == 0) {
 		TCPSTAT_INC(tcps_sig_err_sigopt);
 		/* XXX: should drop? */
 	}
 #endif
 	/*
 	 * If echoed timestamp is later than the current time,
 	 * fall back to non RFC1323 RTT calculation.  Normalize
 	 * timestamp if syncookies were used when this connection
 	 * was established.
 	 */
 	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
 		to.to_tsecr -= tp->ts_offset;
 		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
 			to.to_tsecr = 0;
 		else if (tp->t_flags & TF_PREVVALID &&
 			 tp->t_badrxtwin != 0 && SEQ_LT(to.to_tsecr, tp->t_badrxtwin))
 			cc_cong_signal(tp, th, CC_RTO_ERR);
 	}
 	/*
 	 * Process options only when we get SYN/ACK back. The SYN case
 	 * for incoming connections is handled in tcp_syncache.
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
 	 * or <SYN,ACK>) segment itself is never scaled.
 	 * XXX this is traditional behavior, may need to be cleaned up.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
 		if ((to.to_flags & TOF_SCALE) &&
 		    (tp->t_flags & TF_REQ_SCALE)) {
 			tp->t_flags |= TF_RCVD_SCALE;
 			tp->snd_scale = to.to_wscale;
 		}
 		/*
 		 * Initial send window.  It will be updated with
 		 * the next incoming segment to the scaled value.
 		 */
 		tp->snd_wnd = th->th_win;
 		if (to.to_flags & TOF_TS) {
 			tp->t_flags |= TF_RCVD_TSTMP;
 			tp->ts_recent = to.to_tsval;
 			tp->ts_recent_age = tcp_ts_getticks();
 		}
 		if (to.to_flags & TOF_MSS)
 			tcp_mss(tp, to.to_mss);
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (to.to_flags & TOF_SACKPERM) == 0)
 			tp->t_flags &= ~TF_SACK_PERMIT;
 		if (IS_FASTOPEN(tp->t_flags)) {
 			if (to.to_flags & TOF_FASTOPEN)
 				tcp_fastopen_update_cache(tp, to.to_mss,
 				    to.to_tfo_len, to.to_tfo_cookie);
 			else
 				tcp_fastopen_disable_path(tp);
 		}
 	}
 
 	/*
 	 * If timestamps were negotiated during SYN/ACK they should
 	 * appear on every segment during this session and vice versa.
 	 */
 	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Timestamp missing, "
 			    "no action\n", s, __func__);
 			free(s, M_TCPLOG);
 		}
 	}
 	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
 			    "no action\n", s, __func__);
 			free(s, M_TCPLOG);
 		}
 	}
 
 	/*
 	 * Header prediction: check for the two common cases
 	 * of a uni-directional data xfer.  If the packet has
 	 * no control flags, is in-sequence, the window didn't
 	 * change and we're not retransmitting, it's a
 	 * candidate.  If the length is zero and the ack moved
 	 * forward, we're the sender side of the xfer.  Just
 	 * free the data acked & wake any higher level process
 	 * that was blocked waiting for space.  If the length
 	 * is non-zero and the ack didn't move, we're the
 	 * receiver side.  If we're getting packets in-order
 	 * (the reassembly queue is empty), add the data to
 	 * the socket buffer and note that we need a delayed ack.
 	 * Make sure that the hidden state-flags are also off.
 	 * Since we check for TCPS_ESTABLISHED first, it can only
 	 * be TH_NEEDSYN.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    th->th_seq == tp->rcv_nxt &&
 	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
 	    tp->snd_nxt == tp->snd_max &&
 	    tiwin && tiwin == tp->snd_wnd && 
 	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
 	    LIST_EMPTY(&tp->t_segq) &&
 	    ((to.to_flags & TOF_TS) == 0 ||
 	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
 
 		/*
 		 * If last ACK falls within this segment's sequence numbers,
 		 * record the timestamp.
 		 * NOTE that the test is modified according to the latest
 		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
 		 */
 		if ((to.to_flags & TOF_TS) != 0 &&
 		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 			tp->ts_recent_age = tcp_ts_getticks();
 			tp->ts_recent = to.to_tsval;
 		}
 
 		if (tlen == 0) {
 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
 			    !IN_RECOVERY(tp->t_flags) &&
 			    (to.to_flags & TOF_SACK) == 0 &&
 			    TAILQ_EMPTY(&tp->snd_holes)) {
 				/*
 				 * This is a pure ack for outstanding data.
 				 */
 				if (ti_locked == TI_RLOCKED)
 					INP_INFO_RUNLOCK(&V_tcbinfo);
 				ti_locked = TI_UNLOCKED;
 
 				TCPSTAT_INC(tcps_predack);
 
 				/*
 				 * "bad retransmit" recovery without timestamps.
 				 */
 				if ((to.to_flags & TOF_TS) == 0 &&
 				    tp->t_rxtshift == 1 &&
 				    tp->t_flags & TF_PREVVALID &&
 				    (int)(ticks - tp->t_badrxtwin) < 0) {
 					cc_cong_signal(tp, th, CC_RTO_ERR);
 				}
 
 				/*
 				 * Recalculate the transmit timer / rtt.
 				 *
 				 * Some boxes send broken timestamp replies
 				 * during the SYN+ACK phase, ignore
 				 * timestamps of 0 or we could calculate a
 				 * huge RTT and blow up the retransmit timer.
 				 */
 				if ((to.to_flags & TOF_TS) != 0 &&
 				    to.to_tsecr) {
 					uint32_t t;
 
 					t = tcp_ts_getticks() - to.to_tsecr;
 					if (!tp->t_rttlow || tp->t_rttlow > t)
 						tp->t_rttlow = t;
 					tcp_xmit_timer(tp,
 					    TCP_TS_TO_TICKS(t) + 1);
 				} else if (tp->t_rtttime &&
 				    SEQ_GT(th->th_ack, tp->t_rtseq)) {
 					if (!tp->t_rttlow ||
 					    tp->t_rttlow > ticks - tp->t_rtttime)
 						tp->t_rttlow = ticks - tp->t_rtttime;
 					tcp_xmit_timer(tp,
 							ticks - tp->t_rtttime);
 				}
 				acked = BYTES_THIS_ACK(tp, th);
 
 #ifdef TCP_HHOOK
 				/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 				hhook_run_tcp_est_in(tp, th, &to);
 #endif
 
 				TCPSTAT_ADD(tcps_rcvackpack, nsegs);
 				TCPSTAT_ADD(tcps_rcvackbyte, acked);
 				sbdrop(&so->so_snd, acked);
 				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
 				    SEQ_LEQ(th->th_ack, tp->snd_recover))
 					tp->snd_recover = th->th_ack - 1;
 				
 				/*
 				 * Let the congestion control algorithm update
 				 * congestion control related information. This
 				 * typically means increasing the congestion
 				 * window.
 				 */
 				cc_ack_received(tp, th, nsegs, CC_ACK);
 
 				tp->snd_una = th->th_ack;
 				/*
 				 * Pull snd_wl2 up to prevent seq wrap relative
 				 * to th_ack.
 				 */
 				tp->snd_wl2 = th->th_ack;
 				tp->t_dupacks = 0;
 				m_freem(m);
 
 				/*
 				 * If all outstanding data are acked, stop
 				 * retransmit timer, otherwise restart timer
 				 * using current (possibly backed-off) value.
 				 * If process is waiting for space,
 				 * wakeup/selwakeup/signal.  If data
 				 * are ready to send, let tcp_output
 				 * decide between more output or persist.
 				 */
 #ifdef TCPDEBUG
 				if (so->so_options & SO_DEBUG)
 					tcp_trace(TA_INPUT, ostate, tp,
 					    (void *)tcp_saveipgen,
 					    &tcp_savetcp, 0);
 #endif
 				TCP_PROBE3(debug__input, tp, th, m);
 				if (tp->snd_una == tp->snd_max)
 					tcp_timer_activate(tp, TT_REXMT, 0);
 				else if (!tcp_timer_active(tp, TT_PERSIST))
 					tcp_timer_activate(tp, TT_REXMT,
 						      tp->t_rxtcur);
 				sowwakeup(so);
 				if (sbavail(&so->so_snd))
 					(void) tp->t_fb->tfb_tcp_output(tp);
 				goto check_delack;
 			}
 		} else if (th->th_ack == tp->snd_una &&
 		    tlen <= sbspace(&so->so_rcv)) {
 			int newsize = 0;	/* automatic sockbuf scaling */
 
 			/*
 			 * This is a pure, in-sequence data packet with
 			 * nothing on the reassembly queue and we have enough
 			 * buffer space to take it.
 			 */
 			if (ti_locked == TI_RLOCKED)
 				INP_INFO_RUNLOCK(&V_tcbinfo);
 			ti_locked = TI_UNLOCKED;
 
 			/* Clean receiver SACK report if present */
 			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
 				tcp_clean_sackreport(tp);
 			TCPSTAT_INC(tcps_preddat);
 			tp->rcv_nxt += tlen;
 			/*
 			 * Pull snd_wl1 up to prevent seq wrap relative to
 			 * th_seq.
 			 */
 			tp->snd_wl1 = th->th_seq;
 			/*
 			 * Pull rcv_up up to prevent seq wrap relative to
 			 * rcv_nxt.
 			 */
 			tp->rcv_up = tp->rcv_nxt;
 			TCPSTAT_ADD(tcps_rcvpack, nsegs);
 			TCPSTAT_ADD(tcps_rcvbyte, tlen);
 #ifdef TCPDEBUG
 			if (so->so_options & SO_DEBUG)
 				tcp_trace(TA_INPUT, ostate, tp,
 				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 			TCP_PROBE3(debug__input, tp, th, m);
 
 			newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 
 			/* Add data to socket buffer. */
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				m_freem(m);
 			} else {
 				/*
 				 * Set new socket buffer size.
 				 * Give up when limit is reached.
 				 */
 				if (newsize)
 					if (!sbreserve_locked(&so->so_rcv,
 					    newsize, so, NULL))
 						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 				m_adj(m, drop_hdrlen);	/* delayed header drop */
 				sbappendstream_locked(&so->so_rcv, m, 0);
 			}
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 			if (DELAY_ACK(tp, tlen)) {
 				tp->t_flags |= TF_DELACK;
 			} else {
 				tp->t_flags |= TF_ACKNOW;
 				tp->t_fb->tfb_tcp_output(tp);
 			}
 			goto check_delack;
 		}
 	}
 
 	/*
 	 * Calculate amount of space in receive window,
 	 * and then do TCP input processing.
 	 * Receive window is amount of space in rcv queue,
 	 * but not less than advertised window.
 	 */
 	win = sbspace(&so->so_rcv);
 	if (win < 0)
 		win = 0;
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 
 	switch (tp->t_state) {
 
 	/*
 	 * If the state is SYN_RECEIVED:
 	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
 	 */
 	case TCPS_SYN_RECEIVED:
 		if ((thflags & TH_ACK) &&
 		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 		     SEQ_GT(th->th_ack, tp->snd_max))) {
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 		}
 		if (IS_FASTOPEN(tp->t_flags)) {
 			/*
 			 * When a TFO connection is in SYN_RECEIVED, the
 			 * only valid packets are the initial SYN, a
 			 * retransmit/copy of the initial SYN (possibly with
 			 * a subset of the original data), a valid ACK, a
 			 * FIN, or a RST.
 			 */
 			if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) {
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 			} else if (thflags & TH_SYN) {
 				/* non-initial SYN is ignored */
 				if ((tcp_timer_active(tp, TT_DELACK) || 
 				     tcp_timer_active(tp, TT_REXMT)))
 					goto drop;
 			} else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) {
 				goto drop;
 			}
 		}
 		break;
 
 	/*
 	 * If the state is SYN_SENT:
 	 *	if seg contains a RST with valid ACK (SEQ.ACK has already
 	 *	    been verified), then drop the connection.
 	 *	if seg contains a RST without an ACK, drop the seg.
 	 *	if seg does not contain SYN, then drop the seg.
 	 * Otherwise this is an acceptable SYN segment
 	 *	initialize tp->rcv_nxt and tp->irs
 	 *	if seg contains ack then advance tp->snd_una
 	 *	if seg contains an ECE and ECN support is enabled, the stream
 	 *	    is ECN capable.
 	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
 	 *	arrange for segment to be acked (eventually)
 	 *	continue processing rest of data/controls, beginning with URG
 	 */
 	case TCPS_SYN_SENT:
 		if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
 			TCP_PROBE5(connect__refused, NULL, tp,
 			    m, tp, th);
 			tp = tcp_drop(tp, ECONNREFUSED);
 		}
 		if (thflags & TH_RST)
 			goto drop;
 		if (!(thflags & TH_SYN))
 			goto drop;
 
 		tp->irs = th->th_seq;
 		tcp_rcvseqinit(tp);
 		if (thflags & TH_ACK) {
 			int tfo_partial_ack = 0;
 
 			TCPSTAT_INC(tcps_connects);
 			soisconnected(so);
 #ifdef MAC
 			mac_socketpeer_set_from_mbuf(m, so);
 #endif
 			/* Do window scaling on this connection? */
 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 				tp->rcv_scale = tp->request_r_scale;
 			}
 			tp->rcv_adv += min(tp->rcv_wnd,
 			    TCP_MAXWIN << tp->rcv_scale);
 			tp->snd_una++;		/* SYN is acked */
 			/*
 			 * If not all the data that was sent in the TFO SYN
 			 * has been acked, resend the remainder right away.
 			 */
 			if (IS_FASTOPEN(tp->t_flags) &&
 			    (tp->snd_una != tp->snd_max)) {
 				tp->snd_nxt = th->th_ack;
 				tfo_partial_ack = 1;
 			}
 			/*
 			 * If there's data, delay ACK; if there's also a FIN
 			 * ACKNOW will be turned on later.
 			 */
 			if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack)
 				tcp_timer_activate(tp, TT_DELACK,
 				    tcp_delacktime);
 			else
 				tp->t_flags |= TF_ACKNOW;
 
 			if ((thflags & TH_ECE) && V_tcp_do_ecn) {
 				tp->t_flags |= TF_ECN_PERMIT;
 				TCPSTAT_INC(tcps_ecn_shs);
 			}
 			
 			/*
 			 * Received <SYN,ACK> in SYN_SENT[*] state.
 			 * Transitions:
 			 *	SYN_SENT  --> ESTABLISHED
 			 *	SYN_SENT* --> FIN_WAIT_1
 			 */
 			tp->t_starttime = ticks;
 			if (tp->t_flags & TF_NEEDFIN) {
 				tcp_state_change(tp, TCPS_FIN_WAIT_1);
 				tp->t_flags &= ~TF_NEEDFIN;
 				thflags &= ~TH_SYN;
 			} else {
 				tcp_state_change(tp, TCPS_ESTABLISHED);
 				TCP_PROBE5(connect__established, NULL, tp,
 				    m, tp, th);
 				cc_conn_init(tp);
 				tcp_timer_activate(tp, TT_KEEP,
 				    TP_KEEPIDLE(tp));
 			}
 		} else {
 			/*
 			 * Received initial SYN in SYN-SENT[*] state =>
 			 * simultaneous open.
 			 * If it succeeds, connection is * half-synchronized.
 			 * Otherwise, do 3-way handshake:
 			 *        SYN-SENT -> SYN-RECEIVED
 			 *        SYN-SENT* -> SYN-RECEIVED*
 			 */
 			tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			tcp_state_change(tp, TCPS_SYN_RECEIVED);
 		}
 
 		KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
 		    "ti_locked %d", __func__, ti_locked));
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
 		/*
 		 * Advance th->th_seq to correspond to first data byte.
 		 * If data, trim to stay within window,
 		 * dropping FIN if necessary.
 		 */
 		th->th_seq++;
 		if (tlen > tp->rcv_wnd) {
 			todrop = tlen - tp->rcv_wnd;
 			m_adj(m, -todrop);
 			tlen = tp->rcv_wnd;
 			thflags &= ~TH_FIN;
 			TCPSTAT_INC(tcps_rcvpackafterwin);
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 		}
 		tp->snd_wl1 = th->th_seq - 1;
 		tp->rcv_up = th->th_seq;
 		/*
 		 * Client side of transaction: already sent SYN and data.
 		 * If the remote host used T/TCP to validate the SYN,
 		 * our data will be ACK'd; if so, enter normal data segment
 		 * processing in the middle of step 5, ack processing.
 		 * Otherwise, goto step 6.
 		 */
 		if (thflags & TH_ACK)
 			goto process_ACK;
 
 		goto step6;
 
 	/*
 	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
 	 *      do normal processing.
 	 *
 	 * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
 	 */
 	case TCPS_LAST_ACK:
 	case TCPS_CLOSING:
 		break;  /* continue normal processing */
 	}
 
 	/*
 	 * States other than LISTEN or SYN_SENT.
 	 * First check the RST flag and sequence number since reset segments
 	 * are exempt from the timestamp and connection count tests.  This
 	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
 	 * below which allowed reset segments in half the sequence space
 	 * to fall though and be processed (which gives forged reset
 	 * segments with a random sequence number a 50 percent chance of
 	 * killing a connection).
 	 * Then check timestamp, if present.
 	 * Then check the connection count, if present.
 	 * Then check that at least some bytes of segment are within
 	 * receive window.  If segment begins before rcv_nxt,
 	 * drop leading data (and SYN); if nothing left, just ack.
 	 */
 	if (thflags & TH_RST) {
 		/*
 		 * RFC5961 Section 3.2
 		 *
 		 * - RST drops connection only if SEG.SEQ == RCV.NXT.
 		 * - If RST is in window, we send challenge ACK.
 		 *
 		 * Note: to take into account delayed ACKs, we should
 		 *   test against last_ack_sent instead of rcv_nxt.
 		 * Note 2: we handle special case of closed window, not
 		 *   covered by the RFC.
 		 */
 		if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
 		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
 		    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
 
 			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 			KASSERT(ti_locked == TI_RLOCKED,
 			    ("%s: TH_RST ti_locked %d, th %p tp %p",
 			    __func__, ti_locked, th, tp));
 			KASSERT(tp->t_state != TCPS_SYN_SENT,
 			    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
 			    __func__, th, tp));
 
 			if (V_tcp_insecure_rst ||
 			    tp->last_ack_sent == th->th_seq) {
 				TCPSTAT_INC(tcps_drops);
 				/* Drop the connection. */
 				switch (tp->t_state) {
 				case TCPS_SYN_RECEIVED:
 					so->so_error = ECONNREFUSED;
 					goto close;
 				case TCPS_ESTABLISHED:
 				case TCPS_FIN_WAIT_1:
 				case TCPS_FIN_WAIT_2:
 				case TCPS_CLOSE_WAIT:
 				case TCPS_CLOSING:
 				case TCPS_LAST_ACK:
 					so->so_error = ECONNRESET;
 				close:
 					/* FALLTHROUGH */
 				default:
 					tp = tcp_close(tp);
 				}
 			} else {
 				TCPSTAT_INC(tcps_badrst);
 				/* Send challenge ACK. */
 				tcp_respond(tp, mtod(m, void *), th, m,
 				    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
 				tp->last_ack_sent = tp->rcv_nxt;
 				m = NULL;
 			}
 		}
 		goto drop;
 	}
 
 	/*
 	 * RFC5961 Section 4.2
 	 * Send challenge ACK for any SYN in synchronized state.
 	 */
 	if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT &&
 	    tp->t_state != TCPS_SYN_RECEIVED) {
 		KASSERT(ti_locked == TI_RLOCKED,
 		    ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 
 		TCPSTAT_INC(tcps_badsyn);
 		if (V_tcp_insecure_syn &&
 		    SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
 		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
 			tp = tcp_drop(tp, ECONNRESET);
 			rstreason = BANDLIM_UNLIMITED;
 		} else {
 			/* Send challenge ACK. */
 			tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
 			    tp->snd_nxt, TH_ACK);
 			tp->last_ack_sent = tp->rcv_nxt;
 			m = NULL;
 		}
 		goto drop;
 	}
 
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
 	 * and it's less than ts_recent, drop it.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
 
 		/* Check to see if ts_recent is over 24 days old.  */
 		if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
 			/*
 			 * Invalidate ts_recent.  If this segment updates
 			 * ts_recent, the age will be reset later and ts_recent
 			 * will get a valid value.  If it does not, setting
 			 * ts_recent to zero will at least satisfy the
 			 * requirement that zero be placed in the timestamp
 			 * echo reply when ts_recent isn't valid.  The
 			 * age isn't reset until we get a valid ts_recent
 			 * because we don't want out-of-order segments to be
 			 * dropped when ts_recent is old.
 			 */
 			tp->ts_recent = 0;
 		} else {
 			TCPSTAT_INC(tcps_rcvduppack);
 			TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
 			TCPSTAT_INC(tcps_pawsdrop);
 			if (tlen)
 				goto dropafterack;
 			goto drop;
 		}
 	}
 
 	/*
 	 * In the SYN-RECEIVED state, validate that the packet belongs to
 	 * this connection before trimming the data to fit the receive
 	 * window.  Check the sequence number versus IRS since we know
 	 * the sequence numbers haven't wrapped.  This is a partial fix
 	 * for the "LAND" DoS attack.
 	 */
 	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
 		rstreason = BANDLIM_RST_OPENPORT;
 		goto dropwithreset;
 	}
 
 	todrop = tp->rcv_nxt - th->th_seq;
 	if (todrop > 0) {
 		if (thflags & TH_SYN) {
 			thflags &= ~TH_SYN;
 			th->th_seq++;
 			if (th->th_urp > 1)
 				th->th_urp--;
 			else
 				thflags &= ~TH_URG;
 			todrop--;
 		}
 		/*
 		 * Following if statement from Stevens, vol. 2, p. 960.
 		 */
 		if (todrop > tlen
 		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
 			/*
 			 * Any valid FIN must be to the left of the window.
 			 * At this point the FIN must be a duplicate or out
 			 * of sequence; drop it.
 			 */
 			thflags &= ~TH_FIN;
 
 			/*
 			 * Send an ACK to resynchronize and drop any data.
 			 * But keep on processing for RST or ACK.
 			 */
 			tp->t_flags |= TF_ACKNOW;
 			todrop = tlen;
 			TCPSTAT_INC(tcps_rcvduppack);
 			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
 		} else {
 			TCPSTAT_INC(tcps_rcvpartduppack);
 			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
 		}
 		drop_hdrlen += todrop;	/* drop from the top afterwards */
 		th->th_seq += todrop;
 		tlen -= todrop;
 		if (th->th_urp > todrop)
 			th->th_urp -= todrop;
 		else {
 			thflags &= ~TH_URG;
 			th->th_urp = 0;
 		}
 	}
 
 	/*
 	 * If new data are received on a connection after the
 	 * user processes are gone, then RST the other end.
 	 */
 	if ((so->so_state & SS_NOFDREF) &&
 	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
 		KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
 		    "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
 			    "after socket was closed, "
 			    "sending RST and removing tcpcb\n",
 			    s, __func__, tcpstates[tp->t_state], tlen);
 			free(s, M_TCPLOG);
 		}
 		tp = tcp_close(tp);
 		TCPSTAT_INC(tcps_rcvafterclose);
 		rstreason = BANDLIM_UNLIMITED;
 		goto dropwithreset;
 	}
 
 	/*
 	 * If segment ends after window, drop trailing data
 	 * (and PUSH and FIN); if nothing left, just ACK.
 	 */
 	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
 	if (todrop > 0) {
 		TCPSTAT_INC(tcps_rcvpackafterwin);
 		if (todrop >= tlen) {
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
 			/*
 			 * If window is closed can only take segments at
 			 * window edge, and have to drop data and PUSH from
 			 * incoming segments.  Continue processing, but
 			 * remember to ack.  Otherwise, drop segment
 			 * and ack.
 			 */
 			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 				tp->t_flags |= TF_ACKNOW;
 				TCPSTAT_INC(tcps_rcvwinprobe);
 			} else
 				goto dropafterack;
 		} else
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 		m_adj(m, -todrop);
 		tlen -= todrop;
 		thflags &= ~(TH_PUSH|TH_FIN);
 	}
 
 	/*
 	 * If last ACK falls within this segment's sequence numbers,
 	 * record its timestamp.
 	 * NOTE: 
 	 * 1) That the test incorporates suggestions from the latest
 	 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 * 2) That updating only on newer timestamps interferes with
 	 *    our earlier PAWS tests, so this check should be solely
 	 *    predicated on the sequence space of this segment.
 	 * 3) That we modify the segment boundary check to be 
 	 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len  
 	 *    instead of RFC1323's
 	 *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
 	 *    This modified check allows us to overcome RFC1323's
 	 *    limitations as described in Stevens TCP/IP Illustrated
 	 *    Vol. 2 p.869. In such cases, we can still calculate the
 	 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 		((thflags & (TH_SYN|TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to.to_tsval;
 	}
 
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
 	 * flag is on (half-synchronized state), then queue data for
 	 * later processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_state == TCPS_SYN_RECEIVED ||
 		    (tp->t_flags & TF_NEEDSYN)) {
 			if (tp->t_state == TCPS_SYN_RECEIVED &&
 			    IS_FASTOPEN(tp->t_flags)) {
 				tp->snd_wnd = tiwin;
 				cc_conn_init(tp);
 			}
 			goto step6;
 		} else if (tp->t_flags & TF_ACKNOW)
 			goto dropafterack;
 		else
 			goto drop;
 	}
 
 	/*
 	 * Ack processing.
 	 */
 	switch (tp->t_state) {
 
 	/*
 	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
 	 * ESTABLISHED state and continue processing.
 	 * The ACK was checked above.
 	 */
 	case TCPS_SYN_RECEIVED:
 
 		TCPSTAT_INC(tcps_connects);
 		soisconnected(so);
 		/* Do window scaling? */
 		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 			tp->snd_wnd = tiwin;
 		}
 		/*
 		 * Make transitions:
 		 *      SYN-RECEIVED  -> ESTABLISHED
 		 *      SYN-RECEIVED* -> FIN-WAIT-1
 		 */
 		tp->t_starttime = ticks;
 		if (tp->t_flags & TF_NEEDFIN) {
 			tcp_state_change(tp, TCPS_FIN_WAIT_1);
 			tp->t_flags &= ~TF_NEEDFIN;
 		} else {
 			tcp_state_change(tp, TCPS_ESTABLISHED);
 			TCP_PROBE5(accept__established, NULL, tp,
 			    m, tp, th);
 			if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
 				tcp_fastopen_decrement_counter(tp->t_tfo_pending);
 				tp->t_tfo_pending = NULL;
 
 				/*
 				 * Account for the ACK of our SYN prior to
 				 * regular ACK processing below.
 				 */ 
 				tp->snd_una++;
 			}
 			/*
 			 * TFO connections call cc_conn_init() during SYN
 			 * processing.  Calling it again here for such
 			 * connections is not harmless as it would undo the
 			 * snd_cwnd reduction that occurs when a TFO SYN|ACK
 			 * is retransmitted.
 			 */
 			if (!IS_FASTOPEN(tp->t_flags))
 				cc_conn_init(tp);
 			tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
 		}
 		/*
 		 * If segment contains data or ACK, will call tcp_reass()
 		 * later; if not, do so now to pass queued data to user.
 		 */
 		if (tlen == 0 && (thflags & TH_FIN) == 0)
 			(void) tcp_reass(tp, (struct tcphdr *)0, 0,
 			    (struct mbuf *)0);
 		tp->snd_wl1 = th->th_seq - 1;
 		/* FALLTHROUGH */
 
 	/*
 	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
 	 * ACKs.  If the ack is in the range
 	 *	tp->snd_una < th->th_ack <= tp->snd_max
 	 * then advance tp->snd_una to th->th_ack and drop
 	 * data from the retransmission queue.  If this ACK reflects
 	 * more up to date window information we update our window information.
 	 */
 	case TCPS_ESTABLISHED:
 	case TCPS_FIN_WAIT_1:
 	case TCPS_FIN_WAIT_2:
 	case TCPS_CLOSE_WAIT:
 	case TCPS_CLOSING:
 	case TCPS_LAST_ACK:
 		if (SEQ_GT(th->th_ack, tp->snd_max)) {
 			TCPSTAT_INC(tcps_rcvacktoomuch);
 			goto dropafterack;
 		}
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    ((to.to_flags & TOF_SACK) ||
 		     !TAILQ_EMPTY(&tp->snd_holes)))
 			sack_changed = tcp_sack_doack(tp, &to, th->th_ack);
 		else
 			/*
 			 * Reset the value so that previous (valid) value
 			 * from the last ack with SACK doesn't get used.
 			 */
 			tp->sackhint.sacked_bytes = 0;
 
 #ifdef TCP_HHOOK
 		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 		hhook_run_tcp_est_in(tp, th, &to);
 #endif
 
 		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 			u_int maxseg;
 
 			maxseg = tcp_maxseg(tp);
 			if (tlen == 0 &&
 			    (tiwin == tp->snd_wnd ||
 			    (tp->t_flags & TF_SACK_PERMIT))) {
 				/*
 				 * If this is the first time we've seen a
 				 * FIN from the remote, this is not a
 				 * duplicate and it needs to be processed
 				 * normally.  This happens during a
 				 * simultaneous close.
 				 */
 				if ((thflags & TH_FIN) &&
 				    (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
 					tp->t_dupacks = 0;
 					break;
 				}
 				TCPSTAT_INC(tcps_rcvdupack);
 				/*
 				 * If we have outstanding data (other than
 				 * a window probe), this is a completely
 				 * duplicate ack (ie, window info didn't
 				 * change and FIN isn't set),
 				 * the ack is the biggest we've
 				 * seen and we've seen exactly our rexmt
 				 * threshold of them, assume a packet
 				 * has been dropped and retransmit it.
 				 * Kludge snd_nxt & the congestion
 				 * window so we send only this one
 				 * packet.
 				 *
 				 * We know we're losing at the current
 				 * window size so do congestion avoidance
 				 * (set ssthresh to half the current window
 				 * and pull our congestion window back to
 				 * the new ssthresh).
 				 *
 				 * Dup acks mean that packets have left the
 				 * network (they're now cached at the receiver)
 				 * so bump cwnd by the amount in the receiver
 				 * to keep a constant cwnd packets in the
 				 * network.
 				 *
 				 * When using TCP ECN, notify the peer that
 				 * we reduced the cwnd.
 				 */
 				/*
 				 * Following 2 kinds of acks should not affect
 				 * dupack counting:
 				 * 1) Old acks
 				 * 2) Acks with SACK but without any new SACK
 				 * information in them. These could result from
 				 * any anomaly in the network like a switch
 				 * duplicating packets or a possible DoS attack.
 				 */
 				if (th->th_ack != tp->snd_una ||
 				    ((tp->t_flags & TF_SACK_PERMIT) &&
 				    !sack_changed))
 					break;
 				else if (!tcp_timer_active(tp, TT_REXMT))
 					tp->t_dupacks = 0;
 				else if (++tp->t_dupacks > tcprexmtthresh ||
 				     IN_FASTRECOVERY(tp->t_flags)) {
 					cc_ack_received(tp, th, nsegs,
 					    CC_DUPACK);
 					if ((tp->t_flags & TF_SACK_PERMIT) &&
 					    IN_FASTRECOVERY(tp->t_flags)) {
 						int awnd;
 						
 						/*
 						 * Compute the amount of data in flight first.
 						 * We can inject new data into the pipe iff 
 						 * we have less than 1/2 the original window's
 						 * worth of data in flight.
 						 */
 						if (V_tcp_do_rfc6675_pipe)
 							awnd = tcp_compute_pipe(tp);
 						else
 							awnd = (tp->snd_nxt - tp->snd_fack) +
 								tp->sackhint.sack_bytes_rexmit;
 
 						if (awnd < tp->snd_ssthresh) {
 							tp->snd_cwnd += maxseg;
 							if (tp->snd_cwnd > tp->snd_ssthresh)
 								tp->snd_cwnd = tp->snd_ssthresh;
 						}
 					} else
 						tp->snd_cwnd += maxseg;
 					(void) tp->t_fb->tfb_tcp_output(tp);
 					goto drop;
 				} else if (tp->t_dupacks == tcprexmtthresh) {
 					tcp_seq onxt = tp->snd_nxt;
 
 					/*
 					 * If we're doing sack, check to
 					 * see if we're already in sack
 					 * recovery. If we're not doing sack,
 					 * check to see if we're in newreno
 					 * recovery.
 					 */
 					if (tp->t_flags & TF_SACK_PERMIT) {
 						if (IN_FASTRECOVERY(tp->t_flags)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					} else {
 						if (SEQ_LEQ(th->th_ack,
 						    tp->snd_recover)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					}
 					/* Congestion signal before ack. */
 					cc_cong_signal(tp, th, CC_NDUPACK);
 					cc_ack_received(tp, th, nsegs,
 					    CC_DUPACK);
 					tcp_timer_activate(tp, TT_REXMT, 0);
 					tp->t_rtttime = 0;
 					if (tp->t_flags & TF_SACK_PERMIT) {
 						TCPSTAT_INC(
 						    tcps_sack_recovery_episode);
 						tp->sack_newdata = tp->snd_nxt;
 						tp->snd_cwnd = maxseg;
 						(void) tp->t_fb->tfb_tcp_output(tp);
 						goto drop;
 					}
 					tp->snd_nxt = th->th_ack;
 					tp->snd_cwnd = maxseg;
 					(void) tp->t_fb->tfb_tcp_output(tp);
 					KASSERT(tp->snd_limited <= 2,
 					    ("%s: tp->snd_limited too big",
 					    __func__));
 					tp->snd_cwnd = tp->snd_ssthresh +
 					     maxseg *
 					     (tp->t_dupacks - tp->snd_limited);
 					if (SEQ_GT(onxt, tp->snd_nxt))
 						tp->snd_nxt = onxt;
 					goto drop;
 				} else if (V_tcp_do_rfc3042) {
 					/*
 					 * Process first and second duplicate
 					 * ACKs. Each indicates a segment
 					 * leaving the network, creating room
 					 * for more. Make sure we can send a
 					 * packet on reception of each duplicate
 					 * ACK by increasing snd_cwnd by one
 					 * segment. Restore the original
 					 * snd_cwnd after packet transmission.
 					 */
 					cc_ack_received(tp, th, nsegs,
 					    CC_DUPACK);
 					uint32_t oldcwnd = tp->snd_cwnd;
 					tcp_seq oldsndmax = tp->snd_max;
 					u_int sent;
 					int avail;
 
 					KASSERT(tp->t_dupacks == 1 ||
 					    tp->t_dupacks == 2,
 					    ("%s: dupacks not 1 or 2",
 					    __func__));
 					if (tp->t_dupacks == 1)
 						tp->snd_limited = 0;
 					tp->snd_cwnd =
 					    (tp->snd_nxt - tp->snd_una) +
 					    (tp->t_dupacks - tp->snd_limited) *
 					    maxseg;
 					/*
 					 * Only call tcp_output when there
 					 * is new data available to be sent.
 					 * Otherwise we would send pure ACKs.
 					 */
 					SOCKBUF_LOCK(&so->so_snd);
 					avail = sbavail(&so->so_snd) -
 					    (tp->snd_nxt - tp->snd_una);
 					SOCKBUF_UNLOCK(&so->so_snd);
 					if (avail > 0)
 						(void) tp->t_fb->tfb_tcp_output(tp);
 					sent = tp->snd_max - oldsndmax;
 					if (sent > maxseg) {
 						KASSERT((tp->t_dupacks == 2 &&
 						    tp->snd_limited == 0) ||
 						   (sent == maxseg + 1 &&
 						    tp->t_flags & TF_SENTFIN),
 						    ("%s: sent too much",
 						    __func__));
 						tp->snd_limited = 2;
 					} else if (sent > 0)
 						++tp->snd_limited;
 					tp->snd_cwnd = oldcwnd;
 					goto drop;
 				}
 			}
 			break;
 		} else {
 			/*
 			 * This ack is advancing the left edge, reset the
 			 * counter.
 			 */
 			tp->t_dupacks = 0;
 			/*
 			 * If this ack also has new SACK info, increment the
 			 * counter as per rfc6675.
 			 */
 			if ((tp->t_flags & TF_SACK_PERMIT) && sack_changed)
 				tp->t_dupacks++;
 		}
 
 		KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
 		    ("%s: th_ack <= snd_una", __func__));
 
 		/*
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
 		if (IN_FASTRECOVERY(tp->t_flags)) {
 			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 				if (tp->t_flags & TF_SACK_PERMIT)
 					tcp_sack_partialack(tp, th);
 				else
 					tcp_newreno_partial_ack(tp, th);
 			} else
 				cc_post_recovery(tp, th);
 		}
 		/*
 		 * If we reach this point, ACK is not a duplicate,
 		 *     i.e., it ACKs something we sent.
 		 */
 		if (tp->t_flags & TF_NEEDSYN) {
 			/*
 			 * T/TCP: Connection was half-synchronized, and our
 			 * SYN has been ACK'd (so connection is now fully
 			 * synchronized).  Go to non-starred state,
 			 * increment snd_una for ACK of SYN, and check if
 			 * we can do window scaling.
 			 */
 			tp->t_flags &= ~TF_NEEDSYN;
 			tp->snd_una++;
 			/* Do window scaling? */
 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 				tp->rcv_scale = tp->request_r_scale;
 				/* Send window already scaled. */
 			}
 		}
 
 process_ACK:
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
 		acked = BYTES_THIS_ACK(tp, th);
 		KASSERT(acked >= 0, ("%s: acked unexepectedly negative "
 		    "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__,
 		    tp->snd_una, th->th_ack, tp, m));
 		TCPSTAT_ADD(tcps_rcvackpack, nsegs);
 		TCPSTAT_ADD(tcps_rcvackbyte, acked);
 
 		/*
 		 * If we just performed our first retransmit, and the ACK
 		 * arrives within our recovery window, then it was a mistake
 		 * to do the retransmit in the first place.  Recover our
 		 * original cwnd and ssthresh, and proceed to transmit where
 		 * we left off.
 		 */
 		if (tp->t_rxtshift == 1 &&
 		    tp->t_flags & TF_PREVVALID &&
 		    tp->t_badrxtwin &&
 		    SEQ_LT(to.to_tsecr, tp->t_badrxtwin))
 			cc_cong_signal(tp, th, CC_RTO_ERR);
 
 		/*
 		 * If we have a timestamp reply, update smoothed
 		 * round trip time.  If no timestamp is present but
 		 * transmit timer is running and timed sequence
 		 * number was acked, update smoothed round trip time.
 		 * Since we now have an rtt measurement, cancel the
 		 * timer backoff (cf., Phil Karn's retransmit alg.).
 		 * Recompute the initial retransmit timer.
 		 *
 		 * Some boxes send broken timestamp replies
 		 * during the SYN+ACK phase, ignore
 		 * timestamps of 0 or we could calculate a
 		 * huge RTT and blow up the retransmit timer.
 		 */
 		if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) {
 			uint32_t t;
 
 			t = tcp_ts_getticks() - to.to_tsecr;
 			if (!tp->t_rttlow || tp->t_rttlow > t)
 				tp->t_rttlow = t;
 			tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
 		} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
 			if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
 				tp->t_rttlow = ticks - tp->t_rtttime;
 			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
 		}
 
 		/*
 		 * If all outstanding data is acked, stop retransmit
 		 * timer and remember to restart (more output or persist).
 		 * If there is more data to be acked, restart retransmit
 		 * timer, using current (possibly backed-off) value.
 		 */
 		if (th->th_ack == tp->snd_max) {
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			needoutput = 1;
 		} else if (!tcp_timer_active(tp, TT_PERSIST))
 			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 
 		/*
 		 * If no data (only SYN) was ACK'd,
 		 *    skip rest of ACK processing.
 		 */
 		if (acked == 0)
 			goto step6;
 
 		/*
 		 * Let the congestion control algorithm update congestion
 		 * control related information. This typically means increasing
 		 * the congestion window.
 		 */
 		cc_ack_received(tp, th, nsegs, CC_ACK);
 
 		SOCKBUF_LOCK(&so->so_snd);
 		if (acked > sbavail(&so->so_snd)) {
 			if (tp->snd_wnd >= sbavail(&so->so_snd))
 				tp->snd_wnd -= sbavail(&so->so_snd);
 			else
 				tp->snd_wnd = 0;
 			mfree = sbcut_locked(&so->so_snd,
 			    (int)sbavail(&so->so_snd));
 			ourfinisacked = 1;
 		} else {
 			mfree = sbcut_locked(&so->so_snd, acked);
 			if (tp->snd_wnd >= (uint32_t) acked)
 				tp->snd_wnd -= acked;
 			else
 				tp->snd_wnd = 0;
 			ourfinisacked = 0;
 		}
 		/* NB: sowwakeup_locked() does an implicit unlock. */
 		sowwakeup_locked(so);
 		m_freem(mfree);
 		/* Detect una wraparound. */
 		if (!IN_RECOVERY(tp->t_flags) &&
 		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
 		    SEQ_LEQ(th->th_ack, tp->snd_recover))
 			tp->snd_recover = th->th_ack - 1;
 		/* XXXLAS: Can this be moved up into cc_post_recovery? */
 		if (IN_RECOVERY(tp->t_flags) &&
 		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
 			EXIT_RECOVERY(tp->t_flags);
 		}
 		tp->snd_una = th->th_ack;
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			if (SEQ_GT(tp->snd_una, tp->snd_recover))
 				tp->snd_recover = tp->snd_una;
 		}
 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 			tp->snd_nxt = tp->snd_una;
 
 		switch (tp->t_state) {
 
 		/*
 		 * In FIN_WAIT_1 STATE in addition to the processing
 		 * for the ESTABLISHED state if our FIN is now acknowledged
 		 * then enter FIN_WAIT_2.
 		 */
 		case TCPS_FIN_WAIT_1:
 			if (ourfinisacked) {
 				/*
 				 * If we can't receive any more
 				 * data, then closing user can proceed.
 				 * Starting the timer is contrary to the
 				 * specification, but if we don't get a FIN
 				 * we'll hang forever.
 				 *
 				 * XXXjl:
 				 * we should release the tp also, and use a
 				 * compressed state.
 				 */
 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					soisdisconnected(so);
 					tcp_timer_activate(tp, TT_2MSL,
 					    (tcp_fast_finwait2_recycle ?
 					    tcp_finwait2_timeout :
 					    TP_MAXIDLE(tp)));
 				}
 				tcp_state_change(tp, TCPS_FIN_WAIT_2);
 			}
 			break;
 
 		/*
 		 * In CLOSING STATE in addition to the processing for
 		 * the ESTABLISHED state if the ACK acknowledges our FIN
 		 * then enter the TIME-WAIT state, otherwise ignore
 		 * the segment.
 		 */
 		case TCPS_CLOSING:
 			if (ourfinisacked) {
 				INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 				tcp_twstart(tp);
 				INP_INFO_RUNLOCK(&V_tcbinfo);
 				m_freem(m);
 				return;
 			}
 			break;
 
 		/*
 		 * In LAST_ACK, we may still be waiting for data to drain
 		 * and/or to be acked, as well as for the ack of our FIN.
 		 * If our FIN is now acknowledged, delete the TCB,
 		 * enter the closed state and return.
 		 */
 		case TCPS_LAST_ACK:
 			if (ourfinisacked) {
 				INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 				tp = tcp_close(tp);
 				goto drop;
 			}
 			break;
 		}
 	}
 
 step6:
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Update window information.
 	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
 	 */
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
 	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 		/* keep track of pure window updates */
 		if (tlen == 0 &&
 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 			TCPSTAT_INC(tcps_rcvwinupd);
 		tp->snd_wnd = tiwin;
 		tp->snd_wl1 = th->th_seq;
 		tp->snd_wl2 = th->th_ack;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 		needoutput = 1;
 	}
 
 	/*
 	 * Process segments with URG.
 	 */
 	if ((thflags & TH_URG) && th->th_urp &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		/*
 		 * This is a kludge, but if we receive and accept
 		 * random urgent pointers, we'll crash in
 		 * soreceive.  It's hard to imagine someone
 		 * actually wanting to send this much urgent data.
 		 */
 		SOCKBUF_LOCK(&so->so_rcv);
 		if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
 			th->th_urp = 0;			/* XXX */
 			thflags &= ~TH_URG;		/* XXX */
 			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
 			goto dodata;			/* XXX */
 		}
 		/*
 		 * If this segment advances the known urgent pointer,
 		 * then mark the data stream.  This should not happen
 		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
 		 * a FIN has been received from the remote side.
 		 * In these states we ignore the URG.
 		 *
 		 * According to RFC961 (Assigned Protocols),
 		 * the urgent pointer points to the last octet
 		 * of urgent data.  We continue, however,
 		 * to consider it to indicate the first octet
 		 * of data past the urgent section as the original
 		 * spec states (in one of two places).
 		 */
 		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
 			tp->rcv_up = th->th_seq + th->th_urp;
 			so->so_oobmark = sbavail(&so->so_rcv) +
 			    (tp->rcv_up - tp->rcv_nxt) - 1;
 			if (so->so_oobmark == 0)
 				so->so_rcv.sb_state |= SBS_RCVATMARK;
 			sohasoutofband(so);
 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 		}
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		/*
 		 * Remove out of band data so doesn't get presented to user.
 		 * This can happen independent of advancing the URG pointer,
 		 * but if two URG's are pending at once, some out-of-band
 		 * data may creep in... ick.
 		 */
 		if (th->th_urp <= (uint32_t)tlen &&
 		    !(so->so_options & SO_OOBINLINE)) {
 			/* hdr drop is delayed */
 			tcp_pulloutofband(so, th, m, drop_hdrlen);
 		}
 	} else {
 		/*
 		 * If no out of band data is expected,
 		 * pull receive urgent pointer along
 		 * with the receive window.
 		 */
 		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
 			tp->rcv_up = tp->rcv_nxt;
 	}
 dodata:							/* XXX */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Process the segment text, merging it into the TCP sequencing queue,
 	 * and arranging for acknowledgment of receipt if necessary.
 	 * This process logically involves adjusting tp->rcv_wnd as data
 	 * is presented to the user (this happens in tcp_usrreq.c,
 	 * case PRU_RCVD).  If a FIN has already been received on this
 	 * connection then we just ignore the text.
 	 */
 	tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
 		   IS_FASTOPEN(tp->t_flags));
 	if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		tcp_seq save_start = th->th_seq;
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		/*
 		 * Insert segment which includes th into TCP reassembly queue
 		 * with control block tp.  Set thflags to whether reassembly now
 		 * includes a segment with FIN.  This handles the common case
 		 * inline (segment is the next to be received on an established
 		 * connection, and the queue is empty), avoiding linkage into
 		 * and removal from the queue and repetition of various
 		 * conversions.
 		 * Set DELACK for segments received in order, but ack
 		 * immediately when segments are out of order (so
 		 * fast retransmit can work).
 		 */
 		if (th->th_seq == tp->rcv_nxt &&
 		    LIST_EMPTY(&tp->t_segq) &&
 		    (TCPS_HAVEESTABLISHED(tp->t_state) ||
 		     tfo_syn)) {
 			if (DELAY_ACK(tp, tlen) || tfo_syn)
 				tp->t_flags |= TF_DELACK;
 			else
 				tp->t_flags |= TF_ACKNOW;
 			tp->rcv_nxt += tlen;
 			thflags = th->th_flags & TH_FIN;
 			TCPSTAT_INC(tcps_rcvpack);
 			TCPSTAT_ADD(tcps_rcvbyte, tlen);
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 				m_freem(m);
 			else
 				sbappendstream_locked(&so->so_rcv, m, 0);
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 		} else {
 			/*
 			 * XXX: Due to the header drop above "th" is
 			 * theoretically invalid by now.  Fortunately
 			 * m_adj() doesn't actually frees any mbufs
 			 * when trimming from the head.
 			 */
 			thflags = tcp_reass(tp, th, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 		}
 		if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
 			tcp_update_sack_list(tp, save_start, save_start + tlen);
 #if 0
 		/*
 		 * Note the amount of data that peer has sent into
 		 * our window, in order to estimate the sender's
 		 * buffer size.
 		 * XXX: Unused.
 		 */
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
 			len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
 		else
 			len = so->so_rcv.sb_hiwat;
 #endif
 	} else {
 		m_freem(m);
 		thflags &= ~TH_FIN;
 	}
 
 	/*
 	 * If FIN is received ACK the FIN and let the user know
 	 * that the connection is closing.
 	 */
 	if (thflags & TH_FIN) {
 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 			socantrcvmore(so);
 			/*
 			 * If connection is half-synchronized
 			 * (ie NEEDSYN flag on) then delay ACK,
 			 * so it may be piggybacked when SYN is sent.
 			 * Otherwise, since we received a FIN then no
 			 * more input can be expected, send ACK now.
 			 */
 			if (tp->t_flags & TF_NEEDSYN)
 				tp->t_flags |= TF_DELACK;
 			else
 				tp->t_flags |= TF_ACKNOW;
 			tp->rcv_nxt++;
 		}
 		switch (tp->t_state) {
 
 		/*
 		 * In SYN_RECEIVED and ESTABLISHED STATES
 		 * enter the CLOSE_WAIT state.
 		 */
 		case TCPS_SYN_RECEIVED:
 			tp->t_starttime = ticks;
 			/* FALLTHROUGH */
 		case TCPS_ESTABLISHED:
 			tcp_state_change(tp, TCPS_CLOSE_WAIT);
 			break;
 
 		/*
 		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
 		 * enter the CLOSING state.
 		 */
 		case TCPS_FIN_WAIT_1:
 			tcp_state_change(tp, TCPS_CLOSING);
 			break;
 
 		/*
 		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
 		 * starting the time-wait timer, turning off the other
 		 * standard timers.
 		 */
 		case TCPS_FIN_WAIT_2:
 			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 			KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata "
 			    "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
 			    ti_locked));
 
 			tcp_twstart(tp);
 			INP_INFO_RUNLOCK(&V_tcbinfo);
 			return;
 		}
 	}
 	if (ti_locked == TI_RLOCKED)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	ti_locked = TI_UNLOCKED;
 
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
 	TCP_PROBE3(debug__input, tp, th, m);
 
 	/*
 	 * Return any desired output.
 	 */
 	if (needoutput || (tp->t_flags & TF_ACKNOW))
 		(void) tp->t_fb->tfb_tcp_output(tp);
 
 check_delack:
 	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
 	    __func__, ti_locked));
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (tp->t_flags & TF_DELACK) {
 		tp->t_flags &= ~TF_DELACK;
 		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
 	}
 	INP_WUNLOCK(tp->t_inpcb);
 	return;
 
 dropafterack:
 	/*
 	 * Generate an ACK dropping incoming segment if it occupies
 	 * sequence space, where the ACK reflects our state.
 	 *
 	 * We can now skip the test for the RST flag since all
 	 * paths to this code happen after packets containing
 	 * RST have been dropped.
 	 *
 	 * In the SYN-RECEIVED state, don't send an ACK unless the
 	 * segment we received passes the SYN-RECEIVED ACK test.
 	 * If it fails send a RST.  This breaks the loop in the
 	 * "LAND" DoS attack, and also prevents an ACK storm
 	 * between two listening ports that have been sent forged
 	 * SYN segments, each with the source address of the other.
 	 */
 	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
 	    (SEQ_GT(tp->snd_una, th->th_ack) ||
 	     SEQ_GT(th->th_ack, tp->snd_max)) ) {
 		rstreason = BANDLIM_RST_OPENPORT;
 		goto dropwithreset;
 	}
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
 	TCP_PROBE3(debug__input, tp, th, m);
 	if (ti_locked == TI_RLOCKED)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	ti_locked = TI_UNLOCKED;
 
 	tp->t_flags |= TF_ACKNOW;
 	(void) tp->t_fb->tfb_tcp_output(tp);
 	INP_WUNLOCK(tp->t_inpcb);
 	m_freem(m);
 	return;
 
 dropwithreset:
 	if (ti_locked == TI_RLOCKED)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	ti_locked = TI_UNLOCKED;
 
 	if (tp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_WUNLOCK(tp->t_inpcb);
 	} else
 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 	return;
 
 drop:
 	if (ti_locked == TI_RLOCKED) {
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		ti_locked = TI_UNLOCKED;
 	}
 #ifdef INVARIANTS
 	else
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 #endif
 
 	/*
 	 * Drop space held by incoming segment and return.
 	 */
 #ifdef TCPDEBUG
 	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
 	TCP_PROBE3(debug__input, tp, th, m);
 	if (tp != NULL)
 		INP_WUNLOCK(tp->t_inpcb);
 	m_freem(m);
 }
 
 /*
  * Issue RST and make ACK acceptable to originator of segment.
  * The mbuf must still include the original packet header.
  * tp may be NULL.
  */
 void
 tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
     int tlen, int rstreason)
 {
 #ifdef INET
 	struct ip *ip;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 
 	if (tp != NULL) {
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 	}
 
 	/* Don't bother if destination was broadcast/multicast. */
 	if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
 		goto drop;
 #ifdef INET6
 	if (mtod(m, struct ip *)->ip_v == 6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
 			goto drop;
 		/* IPv6 anycast check is done at tcp6_input() */
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		ip = mtod(m, struct ip *);
 		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
 		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 			goto drop;
 	}
 #endif
 
 	/* Perform bandwidth limiting. */
 	if (badport_bandlim(rstreason) < 0)
 		goto drop;
 
 	/* tcp_respond consumes the mbuf chain. */
 	if (th->th_flags & TH_ACK) {
 		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
 		    th->th_ack, TH_RST);
 	} else {
 		if (th->th_flags & TH_SYN)
 			tlen++;
 		if (th->th_flags & TH_FIN)
 			tlen++;
 		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
 		    (tcp_seq)0, TH_RST|TH_ACK);
 	}
 	return;
 drop:
 	m_freem(m);
 }
 
 /*
  * Parse TCP options and place in tcpopt.
  */
 void
 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
 {
 	int opt, optlen;
 
 	to->to_flags = 0;
 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
 		opt = cp[0];
 		if (opt == TCPOPT_EOL)
 			break;
 		if (opt == TCPOPT_NOP)
 			optlen = 1;
 		else {
 			if (cnt < 2)
 				break;
 			optlen = cp[1];
 			if (optlen < 2 || optlen > cnt)
 				break;
 		}
 		switch (opt) {
 		case TCPOPT_MAXSEG:
 			if (optlen != TCPOLEN_MAXSEG)
 				continue;
 			if (!(flags & TO_SYN))
 				continue;
 			to->to_flags |= TOF_MSS;
 			bcopy((char *)cp + 2,
 			    (char *)&to->to_mss, sizeof(to->to_mss));
 			to->to_mss = ntohs(to->to_mss);
 			break;
 		case TCPOPT_WINDOW:
 			if (optlen != TCPOLEN_WINDOW)
 				continue;
 			if (!(flags & TO_SYN))
 				continue;
 			to->to_flags |= TOF_SCALE;
 			to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
 			break;
 		case TCPOPT_TIMESTAMP:
 			if (optlen != TCPOLEN_TIMESTAMP)
 				continue;
 			to->to_flags |= TOF_TS;
 			bcopy((char *)cp + 2,
 			    (char *)&to->to_tsval, sizeof(to->to_tsval));
 			to->to_tsval = ntohl(to->to_tsval);
 			bcopy((char *)cp + 6,
 			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
 			to->to_tsecr = ntohl(to->to_tsecr);
 			break;
 		case TCPOPT_SIGNATURE:
 			/*
 			 * In order to reply to a host which has set the
 			 * TCP_SIGNATURE option in its initial SYN, we have
 			 * to record the fact that the option was observed
 			 * here for the syncache code to perform the correct
 			 * response.
 			 */
 			if (optlen != TCPOLEN_SIGNATURE)
 				continue;
 			to->to_flags |= TOF_SIGNATURE;
 			to->to_signature = cp + 2;
 			break;
 		case TCPOPT_SACK_PERMITTED:
 			if (optlen != TCPOLEN_SACK_PERMITTED)
 				continue;
 			if (!(flags & TO_SYN))
 				continue;
 			if (!V_tcp_do_sack)
 				continue;
 			to->to_flags |= TOF_SACKPERM;
 			break;
 		case TCPOPT_SACK:
 			if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
 				continue;
 			if (flags & TO_SYN)
 				continue;
 			to->to_flags |= TOF_SACK;
 			to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
 			to->to_sacks = cp + 2;
 			TCPSTAT_INC(tcps_sack_rcv_blocks);
 			break;
 		case TCPOPT_FAST_OPEN:
 			/*
 			 * Cookie length validation is performed by the
 			 * server side cookie checking code or the client
 			 * side cookie cache update code.
 			 */
 			if (!(flags & TO_SYN))
 				continue;
 			if (!V_tcp_fastopen_client_enable &&
 			    !V_tcp_fastopen_server_enable)
 				continue;
 			to->to_flags |= TOF_FASTOPEN;
 			to->to_tfo_len = optlen - 2;
 			to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL;
 			break;
 		default:
 			continue;
 		}
 	}
 }
 
 /*
  * Pull out of band byte out of a segment so
  * it doesn't appear in the user's data queue.
  * It is still reflected in the segment length for
  * sequencing purposes.
  */
 void
 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
     int off)
 {
 	int cnt = off + th->th_urp - 1;
 
 	while (cnt >= 0) {
 		if (m->m_len > cnt) {
 			char *cp = mtod(m, caddr_t) + cnt;
 			struct tcpcb *tp = sototcpcb(so);
 
 			INP_WLOCK_ASSERT(tp->t_inpcb);
 
 			tp->t_iobc = *cp;
 			tp->t_oobflags |= TCPOOB_HAVEDATA;
 			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
 			m->m_len--;
 			if (m->m_flags & M_PKTHDR)
 				m->m_pkthdr.len--;
 			return;
 		}
 		cnt -= m->m_len;
 		m = m->m_next;
 		if (m == NULL)
 			break;
 	}
 	panic("tcp_pulloutofband");
 }
 
 /*
  * Collect new round-trip time estimate
  * and update averages and current timeout.
  */
 void
 tcp_xmit_timer(struct tcpcb *tp, int rtt)
 {
 	int delta;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	TCPSTAT_INC(tcps_rttupdated);
 	tp->t_rttupdated++;
 	if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) {
 		/*
 		 * srtt is stored as fixed point with 5 bits after the
 		 * binary point (i.e., scaled by 8).  The following magic
 		 * is equivalent to the smoothing algorithm in rfc793 with
 		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
 		 * point).  Adjust rtt to origin 0.
 		 */
 		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
 			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
 
 		if ((tp->t_srtt += delta) <= 0)
 			tp->t_srtt = 1;
 
 		/*
 		 * We accumulate a smoothed rtt variance (actually, a
 		 * smoothed mean difference), then set the retransmit
 		 * timer to smoothed rtt + 4 times the smoothed variance.
 		 * rttvar is stored as fixed point with 4 bits after the
 		 * binary point (scaled by 16).  The following is
 		 * equivalent to rfc793 smoothing with an alpha of .75
 		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
 		 * rfc793's wired-in beta.
 		 */
 		if (delta < 0)
 			delta = -delta;
 		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 		if ((tp->t_rttvar += delta) <= 0)
 			tp->t_rttvar = 1;
 		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
 		    tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	} else {
 		/*
 		 * No rtt measurement yet - use the unsmoothed rtt.
 		 * Set the variance to half the rtt (so our first
 		 * retransmit happens at 3*rtt).
 		 */
 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
 		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	}
 	tp->t_rtttime = 0;
 	tp->t_rxtshift = 0;
 
 	/*
 	 * the retransmit should happen at rtt + 4 * rttvar.
 	 * Because of the way we do the smoothing, srtt and rttvar
 	 * will each average +1/2 tick of bias.  When we compute
 	 * the retransmit timer, we want 1/2 tick of rounding and
 	 * 1 extra tick because of +-1/2 tick uncertainty in the
 	 * firing of the timer.  The bias will give us exactly the
 	 * 1.5 tick we need.  But, because the bias is
 	 * statistical, we have to test that we don't drop below
 	 * the minimum feasible timer (which is 2 ticks).
 	 */
 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
 		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
 
 	/*
 	 * We received an ack for a packet that wasn't retransmitted;
 	 * it is probably safe to discard any error indications we've
 	 * received recently.  This isn't quite right, but close enough
 	 * for now (a route might have failed after we sent a segment,
 	 * and the return path might not be symmetrical).
 	 */
 	tp->t_softerror = 0;
 }
 
 /*
  * Determine a reasonable value for maxseg size.
  * If the route is known, check route for mtu.
  * If none, use an mss that can be handled on the outgoing interface
  * without forcing IP to fragment.  If no route is found, route has no mtu,
  * or the destination isn't local, use a default, hopefully conservative
  * size (usually 512 or the default IP max size, but no more than the mtu
  * of the interface), as we can't discover anything about intervening
  * gateways or networks.  We also initialize the congestion/slow start
  * window to be a single segment if the destination isn't local.
  * While looking at the routing entry, we also initialize other path-dependent
  * parameters from pre-set or cached values in the routing entry.
  *
  * NOTE that resulting t_maxseg doesn't include space for TCP options or
  * IP options, e.g. IPSEC data, since length of this data may vary, and
  * thus it is calculated for every segment separately in tcp_output().
  *
  * NOTE that this routine is only called when we process an incoming
  * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
  * settings are handled in tcp_mssopt().
  */
 void
 tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
     struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap)
 {
 	int mss = 0;
 	uint32_t maxmtu = 0;
 	struct inpcb *inp = tp->t_inpcb;
 	struct hc_metrics_lite metrics;
 #ifdef INET6
 	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
 	size_t min_protoh = isipv6 ?
 			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
 			    sizeof (struct tcpiphdr);
 #else
 	const size_t min_protoh = sizeof(struct tcpiphdr);
 #endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (mtuoffer != -1) {
 		KASSERT(offer == -1, ("%s: conflict", __func__));
 		offer = mtuoffer - min_protoh;
 	}
 
 	/* Initialize. */
 #ifdef INET6
 	if (isipv6) {
 		maxmtu = tcp_maxmtu6(&inp->inp_inc, cap);
 		tp->t_maxseg = V_tcp_v6mssdflt;
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		maxmtu = tcp_maxmtu(&inp->inp_inc, cap);
 		tp->t_maxseg = V_tcp_mssdflt;
 	}
 #endif
 
 	/*
 	 * No route to sender, stay with default mss and return.
 	 */
 	if (maxmtu == 0) {
 		/*
 		 * In case we return early we need to initialize metrics
 		 * to a defined state as tcp_hc_get() would do for us
 		 * if there was no cache hit.
 		 */
 		if (metricptr != NULL)
 			bzero(metricptr, sizeof(struct hc_metrics_lite));
 		return;
 	}
 
 	/* What have we got? */
 	switch (offer) {
 		case 0:
 			/*
 			 * Offer == 0 means that there was no MSS on the SYN
 			 * segment, in this case we use tcp_mssdflt as
 			 * already assigned to t_maxseg above.
 			 */
 			offer = tp->t_maxseg;
 			break;
 
 		case -1:
 			/*
 			 * Offer == -1 means that we didn't receive SYN yet.
 			 */
 			/* FALLTHROUGH */
 
 		default:
 			/*
 			 * Prevent DoS attack with too small MSS. Round up
 			 * to at least minmss.
 			 */
 			offer = max(offer, V_tcp_minmss);
 	}
 
 	/*
 	 * rmx information is now retrieved from tcp_hostcache.
 	 */
 	tcp_hc_get(&inp->inp_inc, &metrics);
 	if (metricptr != NULL)
 		bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
 
 	/*
 	 * If there's a discovered mtu in tcp hostcache, use it.
 	 * Else, use the link mtu.
 	 */
 	if (metrics.rmx_mtu)
 		mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
 	else {
 #ifdef INET6
 		if (isipv6) {
 			mss = maxmtu - min_protoh;
 			if (!V_path_mtu_discovery &&
 			    !in6_localaddr(&inp->in6p_faddr))
 				mss = min(mss, V_tcp_v6mssdflt);
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			mss = maxmtu - min_protoh;
 			if (!V_path_mtu_discovery &&
 			    !in_localaddr(inp->inp_faddr))
 				mss = min(mss, V_tcp_mssdflt);
 		}
 #endif
 		/*
 		 * XXX - The above conditional (mss = maxmtu - min_protoh)
 		 * probably violates the TCP spec.
 		 * The problem is that, since we don't know the
 		 * other end's MSS, we are supposed to use a conservative
 		 * default.  But, if we do that, then MTU discovery will
 		 * never actually take place, because the conservative
 		 * default is much less than the MTUs typically seen
 		 * on the Internet today.  For the moment, we'll sweep
 		 * this under the carpet.
 		 *
 		 * The conservative default might not actually be a problem
 		 * if the only case this occurs is when sending an initial
 		 * SYN with options and data to a host we've never talked
 		 * to before.  Then, they will reply with an MSS value which
 		 * will get recorded and the new parameters should get
 		 * recomputed.  For Further Study.
 		 */
 	}
 	mss = min(mss, offer);
 
 	/*
 	 * Sanity check: make sure that maxseg will be large
 	 * enough to allow some data on segments even if the
 	 * all the option space is used (40bytes).  Otherwise
 	 * funny things may happen in tcp_output.
 	 *
 	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
 	 */
 	mss = max(mss, 64);
 
 	tp->t_maxseg = mss;
 }
 
 void
 tcp_mss(struct tcpcb *tp, int offer)
 {
 	int mss;
 	uint32_t bufsize;
 	struct inpcb *inp;
 	struct socket *so;
 	struct hc_metrics_lite metrics;
 	struct tcp_ifcap cap;
 
 	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
 
 	bzero(&cap, sizeof(cap));
 	tcp_mss_update(tp, offer, -1, &metrics, &cap);
 
 	mss = tp->t_maxseg;
 	inp = tp->t_inpcb;
 
 	/*
 	 * If there's a pipesize, change the socket buffer to that size,
 	 * don't change if sb_hiwat is different than default (then it
 	 * has been changed on purpose with setsockopt).
 	 * Make the socket buffers an integral number of mss units;
 	 * if the mss is larger than the socket buffer, decrease the mss.
 	 */
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_snd);
 	if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe)
 		bufsize = metrics.rmx_sendpipe;
 	else
 		bufsize = so->so_snd.sb_hiwat;
 	if (bufsize < mss)
 		mss = bufsize;
 	else {
 		bufsize = roundup(bufsize, mss);
 		if (bufsize > sb_max)
 			bufsize = sb_max;
 		if (bufsize > so->so_snd.sb_hiwat)
 			(void)sbreserve_locked(&so->so_snd, bufsize, so, NULL);
 	}
 	SOCKBUF_UNLOCK(&so->so_snd);
 	/*
 	 * Sanity check: make sure that maxseg will be large
 	 * enough to allow some data on segments even if the
 	 * all the option space is used (40bytes).  Otherwise
 	 * funny things may happen in tcp_output.
 	 *
 	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
 	 */
 	tp->t_maxseg = max(mss, 64);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe)
 		bufsize = metrics.rmx_recvpipe;
 	else
 		bufsize = so->so_rcv.sb_hiwat;
 	if (bufsize > mss) {
 		bufsize = roundup(bufsize, mss);
 		if (bufsize > sb_max)
 			bufsize = sb_max;
 		if (bufsize > so->so_rcv.sb_hiwat)
 			(void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/* Check the interface for TSO capabilities. */
 	if (cap.ifcap & CSUM_TSO) {
 		tp->t_flags |= TF_TSO;
 		tp->t_tsomax = cap.tsomax;
 		tp->t_tsomaxsegcount = cap.tsomaxsegcount;
 		tp->t_tsomaxsegsize = cap.tsomaxsegsize;
 	}
 }
 
 /*
  * Determine the MSS option to send on an outgoing SYN.
  */
 int
 tcp_mssopt(struct in_conninfo *inc)
 {
 	int mss = 0;
 	uint32_t thcmtu = 0;
 	uint32_t maxmtu = 0;
 	size_t min_protoh;
 
 	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
 
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		mss = V_tcp_v6mssdflt;
 		maxmtu = tcp_maxmtu6(inc, NULL);
 		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		mss = V_tcp_mssdflt;
 		maxmtu = tcp_maxmtu(inc, NULL);
 		min_protoh = sizeof(struct tcpiphdr);
 	}
 #endif
 #if defined(INET6) || defined(INET)
 	thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
 #endif
 
 	if (maxmtu && thcmtu)
 		mss = min(maxmtu, thcmtu) - min_protoh;
 	else if (maxmtu || thcmtu)
 		mss = max(maxmtu, thcmtu) - min_protoh;
 
 	return (mss);
 }
 
 
 /*
  * On a partial ack arrives, force the retransmission of the
  * next unacknowledged segment.  Do not clear tp->t_dupacks.
  * By setting snd_nxt to ti_ack, this forces retransmission timer to
  * be started again.
  */
 void
 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
 {
 	tcp_seq onxt = tp->snd_nxt;
 	uint32_t ocwnd = tp->snd_cwnd;
 	u_int maxseg = tcp_maxseg(tp);
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tcp_timer_activate(tp, TT_REXMT, 0);
 	tp->t_rtttime = 0;
 	tp->snd_nxt = th->th_ack;
 	/*
 	 * Set snd_cwnd to one segment beyond acknowledged offset.
 	 * (tp->snd_una has not yet been updated when this function is called.)
 	 */
 	tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
 	tp->t_flags |= TF_ACKNOW;
 	(void) tp->t_fb->tfb_tcp_output(tp);
 	tp->snd_cwnd = ocwnd;
 	if (SEQ_GT(onxt, tp->snd_nxt))
 		tp->snd_nxt = onxt;
 	/*
 	 * Partial window deflation.  Relies on fact that tp->snd_una
 	 * not updated yet.
 	 */
 	if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
 		tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
 	else
 		tp->snd_cwnd = 0;
 	tp->snd_cwnd += maxseg;
 }
 
 int
 tcp_compute_pipe(struct tcpcb *tp)
 {
 	return (tp->snd_max - tp->snd_una +
 		tp->sackhint.sack_bytes_rexmit -
 		tp->sackhint.sacked_bytes);
 }
Index: head/sys/netinet/tcp_stacks/rack.c
===================================================================
--- head/sys/netinet/tcp_stacks/rack.c	(revision 335355)
+++ head/sys/netinet/tcp_stacks/rack.c	(revision 335356)
@@ -1,9214 +1,9188 @@
 /*-
  * Copyright (c) 2016-2018
  *	Netflix Inc.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
 #endif
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #ifdef NETFLIX_STATS
 #include <sys/stats.h>
 #endif
 #include <sys/refcount.h>
 #include <sys/queue.h>
 #include <sys/smp.h>
 #include <sys/kthread.h>
 #include <sys/kern_prefetch.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/vnet.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #ifdef NETFLIX_CWV
 #include <netinet/tcp_newcwv.h>
 #endif
 #include <netinet/tcp_fastopen.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif				/* TCPDEBUG */
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif				/* IPSEC */
 
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <machine/in_cksum.h>
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 #include "sack_filter.h"
 #include "tcp_rack.h"
 #include "rack_bbr_common.h"
 
 uma_zone_t rack_zone;
 uma_zone_t rack_pcb_zone;
 
 #ifndef TICKS2SBT
 #define	TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
 #endif
 
 struct sysctl_ctx_list rack_sysctl_ctx;
 struct sysctl_oid *rack_sysctl_root;
 
 #define CUM_ACKED 1
 #define SACKED 2
 
 /*
  * The RACK module incorporates a number of
  * TCP ideas that have been put out into the IETF
  * over the last few years:
  * - Matt Mathis's Rate Halving which slowly drops
  *    the congestion window so that the ack clock can
  *    be maintained during a recovery.
  * - Yuchung Cheng's RACK TCP (for which its named) that
  *    will stop us using the number of dup acks and instead
  *    use time as the gage of when we retransmit.
  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
  *    of Dukkipati et.al.
  * RACK depends on SACK, so if an endpoint arrives that
  * cannot do SACK the state machine below will shuttle the
  * connection back to using the "default" TCP stack that is
  * in FreeBSD.
  *
  * To implement RACK the original TCP stack was first decomposed
  * into a functional state machine with individual states
  * for each of the possible TCP connection states. The do_segement
  * functions role in life is to mandate the connection supports SACK
  * initially and then assure that the RACK state matches the conenction
  * state before calling the states do_segment function. Each
  * state is simplified due to the fact that the original do_segment
  * has been decomposed and we *know* what state we are in (no
  * switches on the state) and all tests for SACK are gone. This
  * greatly simplifies what each state does.
  *
  * TCP output is also over-written with a new version since it
  * must maintain the new rack scoreboard.
  *
  */
 static int32_t rack_precache = 1;
 static int32_t rack_tlp_thresh = 1;
 static int32_t rack_reorder_thresh = 2;
 static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
 						 * - 60 seconds */
 static int32_t rack_pkt_delay = 1;
 static int32_t rack_inc_var = 0;/* For TLP */
 static int32_t rack_reduce_largest_on_idle = 0;
 static int32_t rack_min_pace_time = 0;
 static int32_t rack_min_pace_time_seg_req=6;
 static int32_t rack_early_recovery = 1;
 static int32_t rack_early_recovery_max_seg = 6;
 static int32_t rack_send_a_lot_in_prr = 1;
 static int32_t rack_min_to = 1;	/* Number of ms minimum timeout */
 static int32_t rack_tlp_in_recovery = 1;	/* Can we do TLP in recovery? */
 static int32_t rack_verbose_logging = 0;
 static int32_t rack_ignore_data_after_close = 1;
 /*
  * Currently regular tcp has a rto_min of 30ms
  * the backoff goes 12 times so that ends up
  * being a total of 122.850 seconds before a
  * connection is killed.
  */
 static int32_t rack_tlp_min = 10;
 static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
 static int32_t rack_rto_max = 30000;	/* 30 seconds */
 static const int32_t rack_free_cache = 2;
 static int32_t rack_hptsi_segments = 40;
 static int32_t rack_rate_sample_method = USE_RTT_LOW;
 static int32_t rack_pace_every_seg = 1;
 static int32_t rack_delayed_ack_time = 200;	/* 200ms */
 static int32_t rack_slot_reduction = 4;
 static int32_t rack_lower_cwnd_at_tlp = 0;
 static int32_t rack_use_proportional_reduce = 0;
 static int32_t rack_proportional_rate = 10;
 static int32_t rack_tlp_max_resend = 2;
 static int32_t rack_limited_retran = 0;
 static int32_t rack_always_send_oldest = 0;
 static int32_t rack_sack_block_limit = 128;
 static int32_t rack_use_sack_filter = 1;
 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
 
 /* Rack specific counters */
 counter_u64_t rack_badfr;
 counter_u64_t rack_badfr_bytes;
 counter_u64_t rack_rtm_prr_retran;
 counter_u64_t rack_rtm_prr_newdata;
 counter_u64_t rack_timestamp_mismatch;
 counter_u64_t rack_reorder_seen;
 counter_u64_t rack_paced_segments;
 counter_u64_t rack_unpaced_segments;
 counter_u64_t rack_saw_enobuf;
 counter_u64_t rack_saw_enetunreach;
 
 /* Tail loss probe counters */
 counter_u64_t rack_tlp_tot;
 counter_u64_t rack_tlp_newdata;
 counter_u64_t rack_tlp_retran;
 counter_u64_t rack_tlp_retran_bytes;
 counter_u64_t rack_tlp_retran_fail;
 counter_u64_t rack_to_tot;
 counter_u64_t rack_to_arm_rack;
 counter_u64_t rack_to_arm_tlp;
 counter_u64_t rack_to_alloc;
 counter_u64_t rack_to_alloc_hard;
 counter_u64_t rack_to_alloc_emerg;
 
 counter_u64_t rack_sack_proc_all;
 counter_u64_t rack_sack_proc_short;
 counter_u64_t rack_sack_proc_restart;
 counter_u64_t rack_runt_sacks;
 counter_u64_t rack_used_tlpmethod;
 counter_u64_t rack_used_tlpmethod2;
 counter_u64_t rack_enter_tlp_calc;
 counter_u64_t rack_input_idle_reduces;
 counter_u64_t rack_tlp_does_nada;
 
 /* Temp CPU counters */
 counter_u64_t rack_find_high;
 
 counter_u64_t rack_progress_drops;
 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
 
 static void
 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
 
 static int
 rack_process_ack(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t * ti_locked,
     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
 static int
 rack_process_data(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static void
 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
     struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
 static struct rack_sendmap *
 rack_check_recovery_mode(struct tcpcb *tp,
     uint32_t tsused);
 static void
 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
     uint32_t type);
 static void rack_counter_destroy(void);
 static int
 rack_ctloutput(struct socket *so, struct sockopt *sopt,
     struct inpcb *inp, struct tcpcb *tp);
 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
 static void
 rack_do_segment(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     uint8_t iptos, int32_t ti_locked);
 static void rack_dtor(void *mem, int32_t size, void *arg);
 static void
 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
     uint32_t t, uint32_t cts);
 static struct rack_sendmap *
 rack_find_high_nonack(struct tcp_rack *rack,
     struct rack_sendmap *rsm);
 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
 static int
 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
 static int32_t rack_handoff_ok(struct tcpcb *tp);
 static int32_t rack_init(struct tcpcb *tp);
 static void rack_init_sysctls(void);
 static void
 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
     struct tcphdr *th);
 static void
 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
     uint8_t pass, struct rack_sendmap *hintrsm);
 static void
 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm);
 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num);
 static int32_t rack_output(struct tcpcb *tp);
 static void
 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     uint8_t iptos, int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv);
 
 static uint32_t
 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
     uint32_t cts);
 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
 static void rack_remxt_tmr(struct tcpcb *tp);
 static int
 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
 static int32_t rack_stopall(struct tcpcb *tp);
 static void
 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
     uint32_t delta);
 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
 static uint32_t
 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
 static void
 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint32_t ts);
 static int
 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
 static void
 rack_challenge_ack(struct mbuf *m, struct tcphdr *th,
     struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val);
 static int
 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_do_closing(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static void rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked);
 static void
 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
     struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val);
 static void
 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
     struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen);
 static int
 rack_do_established(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt);
 static int
 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_drop_checks(struct tcpopt *to, struct mbuf *m,
     struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf,
     int32_t * drop_hdrlen, int32_t * ret_val);
 static int
 rack_process_rst(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, int32_t * ti_locked);
 struct rack_sendmap *
 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
     uint32_t tsused);
 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
 static void
      tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
 
 static int
 rack_ts_check(struct mbuf *m, struct tcphdr *th,
     struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val);
 
 int32_t rack_clear_counter=0;
 
 
 static int
 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t stat;
 	int32_t error;
 
 	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
 	if (error || req->newptr == NULL)
 		return error;
 
 	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
 	if (error)
 		return (error);
 	if (stat == 1) {
 #ifdef INVARIANTS
 		printf("Clearing RACK counters\n");
 #endif
 		counter_u64_zero(rack_badfr);
 		counter_u64_zero(rack_badfr_bytes);
 		counter_u64_zero(rack_rtm_prr_retran);
 		counter_u64_zero(rack_rtm_prr_newdata);
 		counter_u64_zero(rack_timestamp_mismatch);
 		counter_u64_zero(rack_reorder_seen);
 		counter_u64_zero(rack_tlp_tot);
 		counter_u64_zero(rack_tlp_newdata);
 		counter_u64_zero(rack_tlp_retran);
 		counter_u64_zero(rack_tlp_retran_bytes);
 		counter_u64_zero(rack_tlp_retran_fail);
 		counter_u64_zero(rack_to_tot);
 		counter_u64_zero(rack_to_arm_rack);
 		counter_u64_zero(rack_to_arm_tlp);
 		counter_u64_zero(rack_paced_segments);
 		counter_u64_zero(rack_unpaced_segments);
 		counter_u64_zero(rack_saw_enobuf);
 		counter_u64_zero(rack_saw_enetunreach);
 		counter_u64_zero(rack_to_alloc_hard);
 		counter_u64_zero(rack_to_alloc_emerg);
 		counter_u64_zero(rack_sack_proc_all);
 		counter_u64_zero(rack_sack_proc_short);
 		counter_u64_zero(rack_sack_proc_restart);
 		counter_u64_zero(rack_to_alloc);
 		counter_u64_zero(rack_find_high);
 		counter_u64_zero(rack_runt_sacks);
 		counter_u64_zero(rack_used_tlpmethod);
 		counter_u64_zero(rack_used_tlpmethod2);
 		counter_u64_zero(rack_enter_tlp_calc);
 		counter_u64_zero(rack_progress_drops);
 		counter_u64_zero(rack_tlp_does_nada);
 	}
 	rack_clear_counter = 0;
 	return (0);
 }
 
 
 
 static void
 rack_init_sysctls()
 {
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
 	    &rack_rate_sample_method , USE_RTT_LOW,
 	    "What method should we use for rate sampling 0=high, 1=low ");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "data_after_close", CTLFLAG_RW,
 	    &rack_ignore_data_after_close, 0,
 	    "Do we hold off sending a RST until all pending data is ack'd");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlpmethod", CTLFLAG_RW,
 	    &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
 	    "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "min_pace_time", CTLFLAG_RW,
 	    &rack_min_pace_time, 0,
 	    "Should we enforce a minimum pace time of 1ms");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "min_pace_segs", CTLFLAG_RW,
 	    &rack_min_pace_time_seg_req, 6,
 	    "How many segments have to be in the len to enforce min-pace-time");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "idle_reduce_high", CTLFLAG_RW,
 	    &rack_reduce_largest_on_idle, 0,
 	    "Should we reduce the largest cwnd seen to IW on idle reduction");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "bb_verbose", CTLFLAG_RW,
 	    &rack_verbose_logging, 0,
 	    "Should RACK black box logging be verbose");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "sackfiltering", CTLFLAG_RW,
 	    &rack_use_sack_filter, 1,
 	    "Do we use sack filtering?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "delayed_ack", CTLFLAG_RW,
 	    &rack_delayed_ack_time, 200,
 	    "Delayed ack time (200ms)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlpminto", CTLFLAG_RW,
 	    &rack_tlp_min, 10,
 	    "TLP minimum timeout per the specification (10ms)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "precache", CTLFLAG_RW,
 	    &rack_precache, 0,
 	    "Where should we precache the mcopy (0 is not at all)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "sblklimit", CTLFLAG_RW,
 	    &rack_sack_block_limit, 128,
 	    "When do we start paying attention to small sack blocks");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "send_oldest", CTLFLAG_RW,
 	    &rack_always_send_oldest, 1,
 	    "Should we always send the oldest TLP and RACK-TLP");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW,
 	    &rack_tlp_in_recovery, 1,
 	    "Can we do a TLP during recovery?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "rack_tlimit", CTLFLAG_RW,
 	    &rack_limited_retran, 0,
 	    "How many times can a rack timeout drive out sends");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "minrto", CTLFLAG_RW,
 	    &rack_rto_min, 0,
 	    "Minimum RTO in ms -- set with caution below 1000 due to TLP");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "maxrto", CTLFLAG_RW,
 	    &rack_rto_max, 0,
 	    "Maxiumum RTO in ms -- should be at least as large as min_rto");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_retry", CTLFLAG_RW,
 	    &rack_tlp_max_resend, 2,
 	    "How many times does TLP retry a single segment or multiple with no ACK");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
 	    &rack_use_proportional_reduce, 0,
 	    "Should we proportionaly reduce cwnd based on the number of losses ");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "recovery_prop", CTLFLAG_RW,
 	    &rack_proportional_rate, 10,
 	    "What percent reduction per loss");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
 	    &rack_lower_cwnd_at_tlp, 0,
 	    "When a TLP completes a retran should we enter recovery?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
 	    &rack_slot_reduction, 4,
 	    "When setting a slot should we reduce by divisor");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
 	    &rack_pace_every_seg, 1,
 	    "Should we pace out every segment hptsi");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
 	    &rack_hptsi_segments, 6,
 	    "Should we pace out only a limited size of segments");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "prr_sendalot", CTLFLAG_RW,
 	    &rack_send_a_lot_in_prr, 1,
 	    "Send a lot in prr");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "minto", CTLFLAG_RW,
 	    &rack_min_to, 1,
 	    "Minimum rack timeout in milliseconds");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW,
 	    &rack_early_recovery_max_seg, 6,
 	    "Max segments in early recovery");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "earlyrecovery", CTLFLAG_RW,
 	    &rack_early_recovery, 1,
 	    "Do we do early recovery with rack");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "reorder_thresh", CTLFLAG_RW,
 	    &rack_reorder_thresh, 2,
 	    "What factor for rack will be added when seeing reordering (shift right)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
 	    &rack_tlp_thresh, 1,
 	    "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "reorder_fade", CTLFLAG_RW,
 	    &rack_reorder_fade, 0,
 	    "Does reorder detection fade, if so how many ms (0 means never)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "pktdelay", CTLFLAG_RW,
 	    &rack_pkt_delay, 1,
 	    "Extra RACK time (in ms) besides reordering thresh");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "inc_var", CTLFLAG_RW,
 	    &rack_inc_var, 0,
 	    "Should rack add to the TLP timer the variance in rtt calculation");
 	rack_badfr = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "badfr", CTLFLAG_RD,
 	    &rack_badfr, "Total number of bad FRs");
 	rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "badfr_bytes", CTLFLAG_RD,
 	    &rack_badfr_bytes, "Total number of bad FRs");
 	rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "prrsndret", CTLFLAG_RD,
 	    &rack_rtm_prr_retran,
 	    "Total number of prr based retransmits");
 	rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "prrsndnew", CTLFLAG_RD,
 	    &rack_rtm_prr_newdata,
 	    "Total number of prr based new transmits");
 	rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tsnf", CTLFLAG_RD,
 	    &rack_timestamp_mismatch,
 	    "Total number of timestamps that we could not find the reported ts");
 	rack_find_high = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "findhigh", CTLFLAG_RD,
 	    &rack_find_high,
 	    "Total number of FIN causing find-high");
 	rack_reorder_seen = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "reordering", CTLFLAG_RD,
 	    &rack_reorder_seen,
 	    "Total number of times we added delay due to reordering");
 	rack_tlp_tot = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_to_total", CTLFLAG_RD,
 	    &rack_tlp_tot,
 	    "Total number of tail loss probe expirations");
 	rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_new", CTLFLAG_RD,
 	    &rack_tlp_newdata,
 	    "Total number of tail loss probe sending new data");
 
 	rack_tlp_retran = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_retran", CTLFLAG_RD,
 	    &rack_tlp_retran,
 	    "Total number of tail loss probe sending retransmitted data");
 	rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
 	    &rack_tlp_retran_bytes,
 	    "Total bytes of tail loss probe sending retransmitted data");
 	rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
 	    &rack_tlp_retran_fail,
 	    "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
 	rack_to_tot = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "rack_to_tot", CTLFLAG_RD,
 	    &rack_to_tot,
 	    "Total number of times the rack to expired?");
 	rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "arm_rack", CTLFLAG_RD,
 	    &rack_to_arm_rack,
 	    "Total number of times the rack timer armed?");
 	rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "arm_tlp", CTLFLAG_RD,
 	    &rack_to_arm_tlp,
 	    "Total number of times the tlp timer armed?");
 	rack_paced_segments = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "paced", CTLFLAG_RD,
 	    &rack_paced_segments,
 	    "Total number of times a segment send caused hptsi");
 	rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "unpaced", CTLFLAG_RD,
 	    &rack_unpaced_segments,
 	    "Total number of times a segment did not cause hptsi");
 	rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "saw_enobufs", CTLFLAG_RD,
 	    &rack_saw_enobuf,
 	    "Total number of times a segment did not cause hptsi");
 	rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
 	    &rack_saw_enetunreach,
 	    "Total number of times a segment did not cause hptsi");
 	rack_to_alloc = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "allocs", CTLFLAG_RD,
 	    &rack_to_alloc,
 	    "Total allocations of tracking structures");
 	rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "allochard", CTLFLAG_RD,
 	    &rack_to_alloc_hard,
 	    "Total allocations done with sleeping the hard way");
 	rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "allocemerg", CTLFLAG_RD,
 	    &rack_to_alloc_emerg,
 	    "Total alocations done from emergency cache");
 	rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "sack_long", CTLFLAG_RD,
 	    &rack_sack_proc_all,
 	    "Total times we had to walk whole list for sack processing");
 
 	rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "sack_restart", CTLFLAG_RD,
 	    &rack_sack_proc_restart,
 	    "Total times we had to walk whole list due to a restart");
 	rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "sack_short", CTLFLAG_RD,
 	    &rack_sack_proc_short,
 	    "Total times we took shortcut for sack processing");
 	rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
 	    &rack_enter_tlp_calc,
 	    "Total times we called calc-tlp");
 	rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
 	    &rack_used_tlpmethod,
 	    "Total number of runt sacks");
 	rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
 	    &rack_used_tlpmethod2,
 	    "Total number of runt sacks 2");
 	rack_runt_sacks = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "runtsacks", CTLFLAG_RD,
 	    &rack_runt_sacks,
 	    "Total number of runt sacks");
 	rack_progress_drops = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "prog_drops", CTLFLAG_RD,
 	    &rack_progress_drops,
 	    "Total number of progress drops");
 	rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
 	    &rack_input_idle_reduces,
 	    "Total number of idle reductions on input");
 	rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_nada", CTLFLAG_RD,
 	    &rack_tlp_does_nada,
 	    "Total number of nada tlp calls");
 	COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "outsize", CTLFLAG_RD,
 	    rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
 	COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "opts", CTLFLAG_RD,
 	    rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
 	SYSCTL_ADD_PROC(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 	    &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
 }
 
 static inline int32_t
 rack_progress_timeout_check(struct tcpcb *tp)
 {
 	if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
 		if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
 			/*
 			 * There is an assumption that the caller
 			 * will drop the connection so we will
 			 * increment the counters here.
 			 */
 			struct tcp_rack *rack;
 			rack = (struct tcp_rack *)tp->t_fb_ptr;
 			counter_u64_add(rack_progress_drops, 1);
 #ifdef NETFLIX_STATS
 			TCPSTAT_INC(tcps_progdrops);
 #endif
 			rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
 			return (1);
 		}
 	}
 	return (0);
 }
 
 
 static void
 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
 		log.u_bbr.flex2 = to;
 		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex4 = slot;
 		log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
 		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
 		log.u_bbr.flex8 = which;
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_TIMERSTAR, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex8 = to_num;
 		log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
 		log.u_bbr.flex2 = rack->rc_rack_rtt;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_RTO, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
     uint32_t o_srtt, uint32_t o_var)
 {
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex1 = t;
 		log.u_bbr.flex2 = o_srtt;
 		log.u_bbr.flex3 = o_var;
 		log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
 		log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;		
 		log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
 		log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
 		log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
 		TCP_LOG_EVENT(tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_BBRRTT, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
 {
 	/* 
 	 * Log the rtt sample we are
 	 * applying to the srtt algorithm in
 	 * useconds.
 	 */
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 		
 		/* Convert our ms to a microsecond */
 		log.u_bbr.flex1 = rtt * 1000;
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    TCP_LOG_RTT, 0,
 		    0, &log, false, &tv);
 	}
 }
 
 
 static inline void
 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
 {
 	if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex1 = line;
 		log.u_bbr.flex2 = tick;
 		log.u_bbr.flex3 = tp->t_maxunacktime;
 		log.u_bbr.flex4 = tp->t_acktime;
 		log.u_bbr.flex8 = event;
 		TCP_LOG_EVENT(tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_PROGRESS, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex1 = slot;
 		log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
 		log.u_bbr.flex8 = rack->rc_in_persist;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_BBRSND, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		log.u_bbr.flex1 = did_out;
 		log.u_bbr.flex2 = nxt_pkt;
 		log.u_bbr.flex3 = way_out;
 		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex7 = rack->r_wanted_output;
 		log.u_bbr.flex8 = rack->rc_in_persist;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_DOSEG_DONE, 0,
 		    0, &log, false);
 	}
 }
 
 
 static void
 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex1 = slot;
 		log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex7 = hpts_calling;
 		log.u_bbr.flex8 = rack->rc_in_persist;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_JUSTRET, 0,
 		    tlen, &log, false);
 	}
 }
 
 static void
 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex1 = line;
 		log.u_bbr.flex2 = 0;
 		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex4 = 0;
 		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
 		log.u_bbr.flex8 = hpts_removed;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_TIMERCANC, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.flex1 = timers;
 		log.u_bbr.flex2 = ret;
 		log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
 		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex5 = cts;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_TO_PROCESS, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_counter_destroy()
 {
 	counter_u64_free(rack_badfr);
 	counter_u64_free(rack_badfr_bytes);
 	counter_u64_free(rack_rtm_prr_retran);
 	counter_u64_free(rack_rtm_prr_newdata);
 	counter_u64_free(rack_timestamp_mismatch);
 	counter_u64_free(rack_reorder_seen);
 	counter_u64_free(rack_tlp_tot);
 	counter_u64_free(rack_tlp_newdata);
 	counter_u64_free(rack_tlp_retran);
 	counter_u64_free(rack_tlp_retran_bytes);
 	counter_u64_free(rack_tlp_retran_fail);
 	counter_u64_free(rack_to_tot);
 	counter_u64_free(rack_to_arm_rack);
 	counter_u64_free(rack_to_arm_tlp);
 	counter_u64_free(rack_paced_segments);
 	counter_u64_free(rack_unpaced_segments);
 	counter_u64_free(rack_saw_enobuf);
 	counter_u64_free(rack_saw_enetunreach);
 	counter_u64_free(rack_to_alloc_hard);
 	counter_u64_free(rack_to_alloc_emerg);
 	counter_u64_free(rack_sack_proc_all);
 	counter_u64_free(rack_sack_proc_short);
 	counter_u64_free(rack_sack_proc_restart);
 	counter_u64_free(rack_to_alloc);
 	counter_u64_free(rack_find_high);
 	counter_u64_free(rack_runt_sacks);
 	counter_u64_free(rack_enter_tlp_calc);
 	counter_u64_free(rack_used_tlpmethod);
 	counter_u64_free(rack_used_tlpmethod2);
 	counter_u64_free(rack_progress_drops);
 	counter_u64_free(rack_input_idle_reduces);
 	counter_u64_free(rack_tlp_does_nada);
 	COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
 	COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
 }
 
 static struct rack_sendmap *
 rack_alloc(struct tcp_rack *rack)
 {
 	struct rack_sendmap *rsm;
 
 	counter_u64_add(rack_to_alloc, 1);
 	rack->r_ctl.rc_num_maps_alloced++;
 	rsm = uma_zalloc(rack_zone, M_NOWAIT);
 	if (rsm) {
 		return (rsm);
 	}
 	if (rack->rc_free_cnt) {
 		counter_u64_add(rack_to_alloc_emerg, 1);
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
 		TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
 		rack->rc_free_cnt--;
 		return (rsm);
 	}
 	return (NULL);
 }
 
 static void
 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
 {
 	rack->r_ctl.rc_num_maps_alloced--;
 	if (rack->r_ctl.rc_tlpsend == rsm)
 		rack->r_ctl.rc_tlpsend = NULL;
 	if (rack->r_ctl.rc_next == rsm)
 		rack->r_ctl.rc_next = NULL;
 	if (rack->r_ctl.rc_sacklast == rsm)
 		rack->r_ctl.rc_sacklast = NULL;
 	if (rack->rc_free_cnt < rack_free_cache) {
 		memset(rsm, 0, sizeof(struct rack_sendmap));
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
 		rack->rc_free_cnt++;
 		return;
 	}
 	uma_zfree(rack_zone, rsm);
 }
 
 /*
  * CC wrapper hook functions
  */
 static void
 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
     uint16_t type, int32_t recovery)
 {
 #ifdef NETFLIX_STATS
 	int32_t gput;
 #endif
 #ifdef NETFLIX_CWV
 	u_long old_cwnd = tp->snd_cwnd;
 #endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	tp->ccv->nsegs = nsegs;
 	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
 	if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
 		uint32_t max;
 
 		max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg;
 		if (tp->ccv->bytes_this_ack > max) {
 			tp->ccv->bytes_this_ack = max;
 		}
 	}
 	if (tp->snd_cwnd <= tp->snd_wnd)
 		tp->ccv->flags |= CCF_CWND_LIMITED;
 	else
 		tp->ccv->flags &= ~CCF_CWND_LIMITED;
 
 	if (type == CC_ACK) {
 #ifdef NETFLIX_STATS
 		stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
 		    ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
 		if ((tp->t_flags & TF_GPUTINPROG) &&
 		    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
 			gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
 			    max(1, tcp_ts_getticks() - tp->gput_ts);
 			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
 			    gput);
 			/*
 			 * XXXLAS: This is a temporary hack, and should be
 			 * chained off VOI_TCP_GPUT when stats(9) grows an
 			 * API to deal with chained VOIs.
 			 */
 			if (tp->t_stats_gput_prev > 0)
 				stats_voi_update_abs_s32(tp->t_stats,
 				    VOI_TCP_GPUT_ND,
 				    ((gput - tp->t_stats_gput_prev) * 100) /
 				    tp->t_stats_gput_prev);
 			tp->t_flags &= ~TF_GPUTINPROG;
 			tp->t_stats_gput_prev = gput;
 
 			if (tp->t_maxpeakrate) {
 				/*
 				 * We update t_peakrate_thr. This gives us roughly
 				 * one update per round trip time.
 				 */
 				tcp_update_peakrate_thr(tp);
 			}
 		}
 #endif
 		if (tp->snd_cwnd > tp->snd_ssthresh) {
 			tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
 			    nsegs * V_tcp_abc_l_var * tp->t_maxseg);
 			if (tp->t_bytes_acked >= tp->snd_cwnd) {
 				tp->t_bytes_acked -= tp->snd_cwnd;
 				tp->ccv->flags |= CCF_ABC_SENTAWND;
 			}
 		} else {
 			tp->ccv->flags &= ~CCF_ABC_SENTAWND;
 			tp->t_bytes_acked = 0;
 		}
 	}
 	if (CC_ALGO(tp)->ack_received != NULL) {
 		/* XXXLAS: Find a way to live without this */
 		tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->ack_received(tp->ccv, type);
 	}
 #ifdef NETFLIX_STATS
 	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
 #endif
 	if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
 		rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
 	}
 #ifdef NETFLIX_CWV
 	if (tp->cwv_enabled) {
 		/*
 		 * Per RFC 7661: The behaviour in the non-validated phase is
 		 * specified as: o  A sender determines whether to increase
 		 * the cwnd based upon whether it is cwnd-limited (see
 		 * Section 4.5.3): * A sender that is cwnd-limited MAY use
 		 * the standard TCP method to increase cwnd (i.e., the
 		 * standard method permits a TCP sender that fully utilises
 		 * the cwnd to increase the cwnd each time it receives an
 		 * ACK). * A sender that is not cwnd-limited MUST NOT
 		 * increase the cwnd when ACK packets are received in this
 		 * phase (i.e., needs to avoid growing the cwnd when it has
 		 * not recently sent using the current size of cwnd).
 		 */
 		if ((tp->snd_cwnd > old_cwnd) &&
 		    (tp->cwv_cwnd_valid == 0) &&
 		    (!(tp->ccv->flags & CCF_CWND_LIMITED))) {
 			tp->snd_cwnd = old_cwnd;
 		}
 		/* Try to update pipeAck and NCWV state */
 		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    !IN_RECOVERY(tp->t_flags)) {
 			uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd));
 
 			tcp_newcwv_update_pipeack(tp, data);
 		}
 	}
 #endif
 	/* we enforce max peak rate if it is set. */
 	if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
 		tp->snd_cwnd = tp->t_peakrate_thr;
 	}
 }
 
 static void
 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
 {
 	struct tcp_rack *rack;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (rack->r_ctl.rc_prr_sndcnt > 0)
 		rack->r_wanted_output++;
 }
 
 static void
 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
 {
 	struct tcp_rack *rack;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (CC_ALGO(tp)->post_recovery != NULL) {
 		tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->post_recovery(tp->ccv);
 	}
 	/*
 	 * Here we can in theory adjust cwnd to be based on the number of
 	 * losses in the window (rack->r_ctl.rc_loss_count). This is done
 	 * based on the rack_use_proportional flag.
 	 */
 	if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
 		int32_t reduce;
 
 		reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
 		if (reduce > 50) {
 			reduce = 50;
 		}
 		tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
 	} else {
 		if (tp->snd_cwnd > tp->snd_ssthresh) {
 			/* Drop us down to the ssthresh (1/2 cwnd at loss) */
 			tp->snd_cwnd = tp->snd_ssthresh;
 		}
 	}
 	if (rack->r_ctl.rc_prr_sndcnt > 0) {
 		/* Suck the next prr cnt back into cwnd */
 		tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
 		rack->r_ctl.rc_prr_sndcnt = 0;
 	}
 	EXIT_RECOVERY(tp->t_flags);
 
 
 #ifdef NETFLIX_CWV
 	if (tp->cwv_enabled) {
 		if ((tp->cwv_cwnd_valid == 0) &&
 		    (tp->snd_cwv.in_recovery))
 			tcp_newcwv_end_recovery(tp);
 	}
 #endif
 }
 
 static void
 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
 {
 	struct tcp_rack *rack;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	switch (type) {
 	case CC_NDUPACK:
 /*		rack->r_ctl.rc_ssthresh_set = 1;*/
 		if (!IN_FASTRECOVERY(tp->t_flags)) {
 			rack->r_ctl.rc_tlp_rtx_out = 0;
 			rack->r_ctl.rc_prr_delivered = 0;
 			rack->r_ctl.rc_prr_out = 0;
 			rack->r_ctl.rc_loss_count = 0;
 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 			rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
 			tp->snd_recover = tp->snd_max;
 			if (tp->t_flags & TF_ECN_PERMIT)
 				tp->t_flags |= TF_ECN_SND_CWR;
 		}
 		break;
 	case CC_ECN:
 		if (!IN_CONGRECOVERY(tp->t_flags)) {
 			TCPSTAT_INC(tcps_ecn_rcwnd);
 			tp->snd_recover = tp->snd_max;
 			if (tp->t_flags & TF_ECN_PERMIT)
 				tp->t_flags |= TF_ECN_SND_CWR;
 		}
 		break;
 	case CC_RTO:
 		tp->t_dupacks = 0;
 		tp->t_bytes_acked = 0;
 		EXIT_RECOVERY(tp->t_flags);
 		tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
 		    tp->t_maxseg) * tp->t_maxseg;
 		tp->snd_cwnd = tp->t_maxseg;
 		break;
 	case CC_RTO_ERR:
 		TCPSTAT_INC(tcps_sndrexmitbad);
 		/* RTO was unnecessary, so reset everything. */
 		tp->snd_cwnd = tp->snd_cwnd_prev;
 		tp->snd_ssthresh = tp->snd_ssthresh_prev;
 		tp->snd_recover = tp->snd_recover_prev;
 		if (tp->t_flags & TF_WASFRECOVERY)
 			ENTER_FASTRECOVERY(tp->t_flags);
 		if (tp->t_flags & TF_WASCRECOVERY)
 			ENTER_CONGRECOVERY(tp->t_flags);
 		tp->snd_nxt = tp->snd_max;
 		tp->t_badrxtwin = 0;
 		break;
 	}
 
 	if (CC_ALGO(tp)->cong_signal != NULL) {
 		if (th != NULL)
 			tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->cong_signal(tp->ccv, type);
 	}
 #ifdef NETFLIX_CWV
 	if (tp->cwv_enabled) {
 		if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
 			tcp_newcwv_enter_recovery(tp);
 		}
 		if (type == CC_RTO) {
 			tcp_newcwv_reset(tp);
 		}
 	}
 #endif
 }
 
 
 
 static inline void
 rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
 {
 	uint32_t i_cwnd;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 #ifdef NETFLIX_STATS
 	TCPSTAT_INC(tcps_idle_restarts);
 	if (tp->t_state == TCPS_ESTABLISHED)
 		TCPSTAT_INC(tcps_idle_estrestarts);
 #endif
 	if (CC_ALGO(tp)->after_idle != NULL)
 		CC_ALGO(tp)->after_idle(tp->ccv);
 
 	if (tp->snd_cwnd == 1)
 		i_cwnd = tp->t_maxseg;		/* SYN(-ACK) lost */
 	else if (V_tcp_initcwnd_segments)
 		i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
 		    max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460));
 	else if (V_tcp_do_rfc3390)
 		i_cwnd = min(4 * tp->t_maxseg,
 		    max(2 * tp->t_maxseg, 4380));
 	else {
 		/* Per RFC5681 Section 3.1 */
 		if (tp->t_maxseg > 2190)
 			i_cwnd = 2 * tp->t_maxseg;
 		else if (tp->t_maxseg > 1095)
 			i_cwnd = 3 * tp->t_maxseg;
 		else
 			i_cwnd = 4 * tp->t_maxseg;
 	}
 	if (reduce_largest) {
 		/*
 		 * Do we reduce the largest cwnd to make 
 		 * rack play nice on restart hptsi wise?
 		 */
 		if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd  > i_cwnd)
 			((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
 	}
 	/*
 	 * Being idle is no differnt than the initial window. If the cc
 	 * clamps it down below the initial window raise it to the initial
 	 * window.
 	 */
 	if (tp->snd_cwnd < i_cwnd) {
 		tp->snd_cwnd = i_cwnd;
 	}
 }
 
 
 /*
  * Indicate whether this ack should be delayed.  We can delay the ack if
  * following conditions are met:
  *	- There is no delayed ack timer in progress.
  *	- Our last ack wasn't a 0-sized window. We never want to delay
  *	  the ack that opens up a 0-sized window.
  *	- LRO wasn't used for this segment. We make sure by checking that the
  *	  segment size is not larger than the MSS.
  *	- Delayed acks are enabled or this is a half-synchronized T/TCP
  *	  connection.
  */
 #define DELAY_ACK(tp, tlen)			 \
 	(((tp->t_flags & TF_RXWIN0SENT) == 0) && \
 	((tp->t_flags & TF_DELACK) == 0) && 	 \
 	(tlen <= tp->t_maxseg) &&		 \
 	(tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
 
 static inline void
 rack_calc_rwin(struct socket *so, struct tcpcb *tp)
 {
 	int32_t win;
 
 	/*
 	 * Calculate amount of space in receive window, and then do TCP
 	 * input processing. Receive window is amount of space in rcv queue,
 	 * but not less than advertised window.
 	 */
 	win = sbspace(&so->so_rcv);
 	if (win < 0)
 		win = 0;
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 }
 
 static void
 rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked)
 {
 	if (*ti_locked == TI_RLOCKED) {
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		*ti_locked = TI_UNLOCKED;
 	}
 	/*
 	 * Drop space held by incoming segment and return.
 	 */
 	if (tp != NULL)
 		INP_WUNLOCK(tp->t_inpcb);
 	if (m)
 		m_freem(m);
 }
 
 static void
 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen)
 {
 	if (*ti_locked == TI_RLOCKED) {
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		*ti_locked = TI_UNLOCKED;
 	}
 	if (tp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_WUNLOCK(tp->t_inpcb);
 	} else
 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 }
 
 /*
  * The value in ret_val informs the caller
  * if we dropped the tcb (and lock) or not.
  * 1 = we dropped it, 0 = the TCB is still locked
  * and valid.
  */
 static void
 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val)
 {
 	/*
 	 * Generate an ACK dropping incoming segment if it occupies sequence
 	 * space, where the ACK reflects our state.
 	 *
 	 * We can now skip the test for the RST flag since all paths to this
 	 * code happen after packets containing RST have been dropped.
 	 *
 	 * In the SYN-RECEIVED state, don't send an ACK unless the segment
 	 * we received passes the SYN-RECEIVED ACK test. If it fails send a
 	 * RST.  This breaks the loop in the "LAND" DoS attack, and also
 	 * prevents an ACK storm between two listening ports that have been
 	 * sent forged SYN segments, each with the source address of the
 	 * other.
 	 */
 	struct tcp_rack *rack;
 
 	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
 	    (SEQ_GT(tp->snd_una, th->th_ack) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
 		*ret_val = 1;
 		rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
 		return;
 	} else
 		*ret_val = 0;
 	if (*ti_locked == TI_RLOCKED) {
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		*ti_locked = TI_UNLOCKED;
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	rack->r_wanted_output++;
 	tp->t_flags |= TF_ACKNOW;
 	if (m)
 		m_freem(m);
 }
 
 
 static int
 rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t * ti_locked)
 {
 	/*
 	 * RFC5961 Section 3.2
 	 *
 	 * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
 	 * window, we send challenge ACK.
 	 *
 	 * Note: to take into account delayed ACKs, we should test against
 	 * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
 	 * of closed window, not covered by the RFC.
 	 */
 	int dropped = 0;
 
 	if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
 	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
 	    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
 
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		KASSERT(*ti_locked == TI_RLOCKED,
 		    ("%s: TH_RST ti_locked %d, th %p tp %p",
 		    __func__, *ti_locked, th, tp));
 		KASSERT(tp->t_state != TCPS_SYN_SENT,
 		    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
 		    __func__, th, tp));
 
 		if (V_tcp_insecure_rst ||
 		    (tp->last_ack_sent == th->th_seq) ||
 		    (tp->rcv_nxt == th->th_seq) ||
 		    ((tp->last_ack_sent - 1) == th->th_seq)) {
 			TCPSTAT_INC(tcps_drops);
 			/* Drop the connection. */
 			switch (tp->t_state) {
 			case TCPS_SYN_RECEIVED:
 				so->so_error = ECONNREFUSED;
 				goto close;
 			case TCPS_ESTABLISHED:
 			case TCPS_FIN_WAIT_1:
 			case TCPS_FIN_WAIT_2:
 			case TCPS_CLOSE_WAIT:
 			case TCPS_CLOSING:
 			case TCPS_LAST_ACK:
 				so->so_error = ECONNRESET;
 		close:
 				tcp_state_change(tp, TCPS_CLOSED);
 				/* FALLTHROUGH */
 			default:
 				tp = tcp_close(tp);
 			}
 			dropped = 1;
 			rack_do_drop(m, tp, ti_locked);
 		} else {
 			TCPSTAT_INC(tcps_badrst);
 			/* Send challenge ACK. */
 			tcp_respond(tp, mtod(m, void *), th, m,
 			    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
 			tp->last_ack_sent = tp->rcv_nxt;
 		}
 	} else {
 		m_freem(m);
 	}
 	return (dropped);
 }
 
 /*
  * The value in ret_val informs the caller
  * if we dropped the tcb (and lock) or not.
  * 1 = we dropped it, 0 = the TCB is still locked
  * and valid.
  */
 static void
 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val)
 {
 	KASSERT(*ti_locked == TI_RLOCKED,
 	    ("tcp_do_segment: TH_SYN ti_locked %d", *ti_locked));
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 
 	TCPSTAT_INC(tcps_badsyn);
 	if (V_tcp_insecure_syn &&
 	    SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
 		tp = tcp_drop(tp, ECONNRESET);
 		*ret_val = 1;
 		rack_do_drop(m, tp, ti_locked);
 	} else {
 		/* Send challenge ACK. */
 		tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
 		    tp->snd_nxt, TH_ACK);
 		tp->last_ack_sent = tp->rcv_nxt;
 		m = NULL;
 		*ret_val = 0;
 		rack_do_drop(m, NULL, ti_locked);
 	}
 }
 
 /*
  * rack_ts_check returns 1 for you should not proceed. It places
  * in ret_val what should be returned 1/0 by the caller. The 1 indicates
  * that the TCB is unlocked and probably dropped. The 0 indicates the
  * TCB is still valid and locked.
  */
 static int
 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val)
 {
 
 	/* Check to see if ts_recent is over 24 days old.  */
 	if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
 		/*
 		 * Invalidate ts_recent.  If this segment updates ts_recent,
 		 * the age will be reset later and ts_recent will get a
 		 * valid value.  If it does not, setting ts_recent to zero
 		 * will at least satisfy the requirement that zero be placed
 		 * in the timestamp echo reply when ts_recent isn't valid.
 		 * The age isn't reset until we get a valid ts_recent
 		 * because we don't want out-of-order segments to be dropped
 		 * when ts_recent is old.
 		 */
 		tp->ts_recent = 0;
 	} else {
 		TCPSTAT_INC(tcps_rcvduppack);
 		TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
 		TCPSTAT_INC(tcps_pawsdrop);
 		*ret_val = 0;
 		if (tlen) {
 			rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val);
 		} else {
 			rack_do_drop(m, NULL, ti_locked);
 		}
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * rack_drop_checks returns 1 for you should not proceed. It places
  * in ret_val what should be returned 1/0 by the caller. The 1 indicates
  * that the TCB is unlocked and probably dropped. The 0 indicates the
  * TCB is still valid and locked.
  */
 static int
 rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
 {
 	int32_t todrop;
 	int32_t thflags;
 	int32_t tlen;
 
 	thflags = *thf;
 	tlen = *tlenp;
 	todrop = tp->rcv_nxt - th->th_seq;
 	if (todrop > 0) {
 		if (thflags & TH_SYN) {
 			thflags &= ~TH_SYN;
 			th->th_seq++;
 			if (th->th_urp > 1)
 				th->th_urp--;
 			else
 				thflags &= ~TH_URG;
 			todrop--;
 		}
 		/*
 		 * Following if statement from Stevens, vol. 2, p. 960.
 		 */
 		if (todrop > tlen
 		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
 			/*
 			 * Any valid FIN must be to the left of the window.
 			 * At this point the FIN must be a duplicate or out
 			 * of sequence; drop it.
 			 */
 			thflags &= ~TH_FIN;
 			/*
 			 * Send an ACK to resynchronize and drop any data.
 			 * But keep on processing for RST or ACK.
 			 */
 			tp->t_flags |= TF_ACKNOW;
 			todrop = tlen;
 			TCPSTAT_INC(tcps_rcvduppack);
 			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
 		} else {
 			TCPSTAT_INC(tcps_rcvpartduppack);
 			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
 		}
 		*drop_hdrlen += todrop;	/* drop from the top afterwards */
 		th->th_seq += todrop;
 		tlen -= todrop;
 		if (th->th_urp > todrop)
 			th->th_urp -= todrop;
 		else {
 			thflags &= ~TH_URG;
 			th->th_urp = 0;
 		}
 	}
 	/*
 	 * If segment ends after window, drop trailing data (and PUSH and
 	 * FIN); if nothing left, just ACK.
 	 */
 	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
 	if (todrop > 0) {
 		TCPSTAT_INC(tcps_rcvpackafterwin);
 		if (todrop >= tlen) {
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
 			/*
 			 * If window is closed can only take segments at
 			 * window edge, and have to drop data and PUSH from
 			 * incoming segments.  Continue processing, but
 			 * remember to ack.  Otherwise, drop segment and
 			 * ack.
 			 */
 			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 				tp->t_flags |= TF_ACKNOW;
 				TCPSTAT_INC(tcps_rcvwinprobe);
 			} else {
 				rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val);
 				return (1);
 			}
 		} else
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 		m_adj(m, -todrop);
 		tlen -= todrop;
 		thflags &= ~(TH_PUSH | TH_FIN);
 	}
 	*thf = thflags;
 	*tlenp = tlen;
 	return (0);
 }
 
 static struct rack_sendmap *
 rack_find_lowest_rsm(struct tcp_rack *rack)
 {
 	struct rack_sendmap *rsm;
 
 	/*
 	 * Walk the time-order transmitted list looking for an rsm that is
 	 * not acked. This will be the one that was sent the longest time
 	 * ago that is still outstanding.
 	 */
 	TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
 		if (rsm->r_flags & RACK_ACKED) {
 			continue;
 		}
 		goto finish;
 	}
 finish:
 	return (rsm);
 }
 
 static struct rack_sendmap *
 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
 {
 	struct rack_sendmap *prsm;
 
 	/*
 	 * Walk the sequence order list backward until we hit and arrive at
 	 * the highest seq not acked. In theory when this is called it
 	 * should be the last segment (which it was not).
 	 */
 	counter_u64_add(rack_find_high, 1);
 	prsm = rsm;
 	TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) {
 		if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
 			continue;
 		}
 		return (prsm);
 	}
 	return (NULL);
 }
 
 
 static uint32_t
 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
 {
 	int32_t lro;
 	uint32_t thresh;
 
 	/*
 	 * lro is the flag we use to determine if we have seen reordering.
 	 * If it gets set we have seen reordering. The reorder logic either
 	 * works in one of two ways:
 	 *
 	 * If reorder-fade is configured, then we track the last time we saw
 	 * re-ordering occur. If we reach the point where enough time as
 	 * passed we no longer consider reordering has occuring.
 	 *
 	 * Or if reorder-face is 0, then once we see reordering we consider
 	 * the connection to alway be subject to reordering and just set lro
 	 * to 1.
 	 *
 	 * In the end if lro is non-zero we add the extra time for
 	 * reordering in.
 	 */
 	if (srtt == 0)
 		srtt = 1;
 	if (rack->r_ctl.rc_reorder_ts) {
 		if (rack->r_ctl.rc_reorder_fade) {
 			if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
 				lro = cts - rack->r_ctl.rc_reorder_ts;
 				if (lro == 0) {
 					/*
 					 * No time as passed since the last
 					 * reorder, mark it as reordering.
 					 */
 					lro = 1;
 				}
 			} else {
 				/* Negative time? */
 				lro = 0;
 			}
 			if (lro > rack->r_ctl.rc_reorder_fade) {
 				/* Turn off reordering seen too */
 				rack->r_ctl.rc_reorder_ts = 0;
 				lro = 0;
 			}
 		} else {
 			/* Reodering does not fade */
 			lro = 1;
 		}
 	} else {
 		lro = 0;
 	}
 	thresh = srtt + rack->r_ctl.rc_pkt_delay;
 	if (lro) {
 		/* It must be set, if not you get 1/4 rtt */
 		if (rack->r_ctl.rc_reorder_shift)
 			thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
 		else
 			thresh += (srtt >> 2);
 	} else {
 		thresh += 1;
 	}
 	/* We don't let the rack timeout be above a RTO */
 	
 	if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
 		thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
 	}
 	/* And we don't want it above the RTO max either */
 	if (thresh > rack_rto_max) {
 		thresh = rack_rto_max;
 	}
 	return (thresh);
 }
 
 static uint32_t
 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
 		     struct rack_sendmap *rsm, uint32_t srtt)
 {
 	struct rack_sendmap *prsm;
 	uint32_t thresh, len;
 	int maxseg;
 	
 	if (srtt == 0)
 		srtt = 1;
 	if (rack->r_ctl.rc_tlp_threshold)
 		thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
 	else
 		thresh = (srtt * 2);
 	
 	/* Get the previous sent packet, if any  */
 	maxseg = tcp_maxseg(tp);
 	counter_u64_add(rack_enter_tlp_calc, 1);
 	len = rsm->r_end - rsm->r_start;
 	if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
 		/* Exactly like the ID */
 		if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
 			uint32_t alt_thresh;
 			/*
 			 * Compensate for delayed-ack with the d-ack time.
 			 */
 			counter_u64_add(rack_used_tlpmethod, 1);
 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
 			if (alt_thresh > thresh)
 				thresh = alt_thresh;
 		}
 	} else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
 		/* 2.1 behavior */
 		prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
 		if (prsm && (len <= maxseg)) {
 			/*
 			 * Two packets outstanding, thresh should be (2*srtt) +
 			 * possible inter-packet delay (if any).
 			 */
 			uint32_t inter_gap = 0;
 			int idx, nidx;
 			
 			counter_u64_add(rack_used_tlpmethod, 1);
 			idx = rsm->r_rtr_cnt - 1;
 			nidx = prsm->r_rtr_cnt - 1;
 			if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
 				/* Yes it was sent later (or at the same time) */
 				inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
 			}
 			thresh += inter_gap;
 		} else 	if (len <= maxseg) {
 			/*
 			 * Possibly compensate for delayed-ack.
 			 */
 			uint32_t alt_thresh;
 			
 			counter_u64_add(rack_used_tlpmethod2, 1);
 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
 			if (alt_thresh > thresh)
 				thresh = alt_thresh;
 		}
 	} else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
 		/* 2.2 behavior */
 		if (len <= maxseg) {
 			uint32_t alt_thresh;
 			/*
 			 * Compensate for delayed-ack with the d-ack time.
 			 */
 			counter_u64_add(rack_used_tlpmethod, 1);
 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
 			if (alt_thresh > thresh)
 				thresh = alt_thresh;
 		}
 	}
  	/* Not above an RTO */
 	if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
 		thresh = TICKS_2_MSEC(tp->t_rxtcur);
 	}
 	/* Not above a RTO max */
 	if (thresh > rack_rto_max) {
 		thresh = rack_rto_max;
 	}
 	/* Apply user supplied min TLP */
 	if (thresh < rack_tlp_min) {
 		thresh = rack_tlp_min;
 	}
 	return (thresh);
 }
 
 static struct rack_sendmap *
 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
 {
 	/*
 	 * Check to see that we don't need to fall into recovery. We will
 	 * need to do so if our oldest transmit is past the time we should
 	 * have had an ack.
 	 */
 	struct tcp_rack *rack;
 	struct rack_sendmap *rsm;
 	int32_t idx;
 	uint32_t srtt_cur, srtt, thresh;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
 		return (NULL);
 	}
 	srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
 	srtt = TICKS_2_MSEC(srtt_cur);
 	if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
 		srtt = rack->rc_rack_rtt;
 
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if (rsm == NULL)
 		return (NULL);
 
 	if (rsm->r_flags & RACK_ACKED) {
 		rsm = rack_find_lowest_rsm(rack);
 		if (rsm == NULL)
 			return (NULL);
 	}
 	idx = rsm->r_rtr_cnt - 1;
 	thresh = rack_calc_thresh_rack(rack, srtt, tsused);
 	if (tsused < rsm->r_tim_lastsent[idx]) {
 		return (NULL);
 	}
 	if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
 		return (NULL);
 	}
 	/* Ok if we reach here we are over-due */
 	rack->r_ctl.rc_rsm_start = rsm->r_start;
 	rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
 	rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
 	rack_cong_signal(tp, NULL, CC_NDUPACK);
 	return (rsm);
 }
 
 static uint32_t
 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	int32_t t;
 	int32_t tt;
 	uint32_t ret_val;
 
 	t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
 	TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
 	    tcp_persmin, tcp_persmax);
 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 		tp->t_rxtshift++;
 	rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
 	ret_val = (uint32_t)tt;
 	return (ret_val);
 }
 
 static uint32_t
 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	/*
 	 * Start the FR timer, we do this based on getting the first one in
 	 * the rc_tmap. Note that if its NULL we must stop the timer. in all
 	 * events we need to stop the running timer (if its running) before
 	 * starting the new one.
 	 */
 	uint32_t thresh, exp, to, srtt, time_since_sent;
 	uint32_t srtt_cur;
 	int32_t idx;
 	int32_t is_tlp_timer = 0;
 	struct rack_sendmap *rsm;
 	
 	if (rack->t_timers_stopped) {
 		/* All timers have been stopped none are to run */
 		return (0);
 	}
 	if (rack->rc_in_persist) {
 		/* We can't start any timer in persists */
 		return (rack_get_persists_timer_val(tp, rack));
 	}
 	if (tp->t_state < TCPS_ESTABLISHED)
 		goto activate_rxt;
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if (rsm == NULL) {
 		/* Nothing on the send map */
 activate_rxt:
 		if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
 			rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
 			to = TICKS_2_MSEC(tp->t_rxtcur);
 			if (to == 0)
 				to = 1;
 			return (to);
 		}
 		return (0);
 	}
 	if (rsm->r_flags & RACK_ACKED) {
 		rsm = rack_find_lowest_rsm(rack);
 		if (rsm == NULL) {
 			/* No lowest? */
 			goto activate_rxt;
 		}
 	}
 	/* Convert from ms to usecs */
 	if (rsm->r_flags & RACK_SACK_PASSED) {
 		if ((tp->t_flags & TF_SENTFIN) &&
 		    ((tp->snd_max - tp->snd_una) == 1) &&
 		    (rsm->r_flags & RACK_HAS_FIN)) {
 			/*
 			 * We don't start a rack timer if all we have is a
 			 * FIN outstanding.
 			 */
 			goto activate_rxt;
 		}
 		if (tp->t_srtt) {
 			srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
 			srtt = TICKS_2_MSEC(srtt_cur);
 		} else
 			srtt = RACK_INITIAL_RTO;
 
 		thresh = rack_calc_thresh_rack(rack, srtt, cts);
 		idx = rsm->r_rtr_cnt - 1;
 		exp = rsm->r_tim_lastsent[idx] + thresh;
 		if (SEQ_GEQ(exp, cts)) {
 			to = exp - cts;
 			if (to < rack->r_ctl.rc_min_to) {
 				to = rack->r_ctl.rc_min_to;
 			}
 		} else {
 			to = rack->r_ctl.rc_min_to;
 		}
 	} else {
 		/* Ok we need to do a TLP not RACK */
 		if ((rack->rc_tlp_in_progress != 0) ||
 		    (rack->r_ctl.rc_tlp_rtx_out != 0)) {
 			/*
 			 * The previous send was a TLP or a tlp_rtx is in
 			 * process.
 			 */
 			goto activate_rxt;
 		}
 		rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
 		if (rsm == NULL) {
 			/* We found no rsm to TLP with. */
 			goto activate_rxt;
 		}
 		if (rsm->r_flags & RACK_HAS_FIN) {
 			/* If its a FIN we dont do TLP */
 			rsm = NULL;
 			goto activate_rxt;
 		}
 		idx = rsm->r_rtr_cnt - 1;
 		if (TSTMP_GT(cts,  rsm->r_tim_lastsent[idx])) 
 			time_since_sent = cts - rsm->r_tim_lastsent[idx];
 		else
 			time_since_sent = 0;
 		is_tlp_timer = 1;
 		if (tp->t_srtt) {
 			srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
 			srtt = TICKS_2_MSEC(srtt_cur);
 		} else
 			srtt = RACK_INITIAL_RTO;
 		thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
 		if (thresh > time_since_sent)
 			to = thresh - time_since_sent;
 		else
 			to = rack->r_ctl.rc_min_to;
 		if (to > TCPTV_REXMTMAX) {
 			/*
 			 * If the TLP time works out to larger than the max
 			 * RTO lets not do TLP.. just RTO.
 			 */
 			goto activate_rxt;
 		}
 		if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) {
 			/*
 			 * The tail is no longer the last one I did a probe
 			 * on
 			 */
 			rack->r_ctl.rc_tlp_seg_send_cnt = 0;
 			rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
 		}
 	}
 	if (is_tlp_timer == 0) {
 		rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
 	} else {
 		if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) ||
 		    (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
 			/*
 			 * We have exceeded how many times we can retran the
 			 * current TLP timer, switch to the RTO timer.
 			 */
 			goto activate_rxt;
 		} else {
 			rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
 		}
 	}
 	if (to == 0)
 		to = 1;
 	return (to);
 }
 
 static void
 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	if (rack->rc_in_persist == 0) {
 		if (((tp->t_flags & TF_SENTFIN) == 0) &&
 		    (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd))
 			/* Must need to send more data to enter persist */
 			return;
 		rack->r_ctl.rc_went_idle_time = cts;
 		rack_timer_cancel(tp, rack, cts, __LINE__);
 		tp->t_rxtshift = 0;
 		rack->rc_in_persist = 1;
 	}
 }
 
 static void
 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	if (rack->rc_inp->inp_in_hpts)  {
 		tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
 		rack->r_ctl.rc_hpts_flags  = 0;
 	}
 	rack->rc_in_persist = 0;
 	rack->r_ctl.rc_went_idle_time = 0;
 	tp->t_flags &= ~TF_FORCEDATA;
 	tp->t_rxtshift = 0;
 }
 
 static void
 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line,
     int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail)
 {
 	struct inpcb *inp;
 	uint32_t delayed_ack = 0;
 	uint32_t hpts_timeout;
 	uint8_t stopped;
 	uint32_t left = 0;
 
 	inp = tp->t_inpcb;
 	if (inp->inp_in_hpts) {
 		/* A previous call is already set up */
 		return;
 	}
 	if (tp->t_state == TCPS_CLOSED) {
 		return;
 	}
 	stopped = rack->rc_tmr_stopped;
 	if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
 		left = rack->r_ctl.rc_timer_exp - cts;
 	}
 	rack->r_ctl.rc_timer_exp = 0;
 	if (rack->rc_inp->inp_in_hpts == 0) {
 		rack->r_ctl.rc_hpts_flags = 0;
 	} 
 	if (slot) {
 		/* We are hptsi too */
 		rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
 	} else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
 		/* 
 		 * We are still left on the hpts when the to goes
 		 * it will be for output.
 		 */
 		if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to))
 			slot = cts - rack->r_ctl.rc_last_output_to;
 		else
 			slot = 1;
 	}
 	if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) {
 		/* No send window.. we must enter persist */
 		rack_enter_persist(tp, rack, cts);
 	} else if ((frm_out_sbavail &&
 		    (frm_out_sbavail > (tp->snd_max - tp->snd_una)) &&
 		    (tp->snd_wnd < tp->t_maxseg)) &&
 	    TCPS_HAVEESTABLISHED(tp->t_state)) {
 		/*
 		 * If we have no window or we can't send a segment (and have
 		 * data to send.. we cheat here and frm_out_sbavail is
 		 * passed in with the sbavail(sb) only from bbr_output) and
 		 * we are established, then we must enter persits (if not
 		 * already in persits).
 		 */
 		rack_enter_persist(tp, rack, cts);
 	}
 	hpts_timeout = rack_timer_start(tp, rack, cts);
 	if (tp->t_flags & TF_DELACK) {
 		delayed_ack = tcp_delacktime;
 		rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
 	}
 	if (delayed_ack && ((hpts_timeout == 0) ||
 			    (delayed_ack < hpts_timeout)))
 		hpts_timeout = delayed_ack;
 	else 
 		rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
 	/*
 	 * If no timers are going to run and we will fall off the hptsi
 	 * wheel, we resort to a keep-alive timer if its configured.
 	 */
 	if ((hpts_timeout == 0) &&
 	    (slot == 0)) {
 		if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
 		    (tp->t_state <= TCPS_CLOSING)) {
 			/*
 			 * Ok we have no timer (persists, rack, tlp, rxt  or
 			 * del-ack), we don't have segments being paced. So
 			 * all that is left is the keepalive timer.
 			 */
 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 				/* Get the established keep-alive time */
 				hpts_timeout = TP_KEEPIDLE(tp);
 			} else {
 				/* Get the initial setup keep-alive time */
 				hpts_timeout = TP_KEEPINIT(tp);
 			}
 			rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
 		}
 	}
 	if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
 	    (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
 		/*
 		 * RACK, TLP, persists and RXT timers all are restartable
 		 * based on actions input .. i.e we received a packet (ack
 		 * or sack) and that changes things (rw, or snd_una etc).
 		 * Thus we can restart them with a new value. For
 		 * keep-alive, delayed_ack we keep track of what was left
 		 * and restart the timer with a smaller value.
 		 */
 		if (left < hpts_timeout)
 			hpts_timeout = left;
 	}
 	if (hpts_timeout) {
 		/*
 		 * Hack alert for now we can't time-out over 2,147,483
 		 * seconds (a bit more than 596 hours), which is probably ok
 		 * :).
 		 */
 		if (hpts_timeout > 0x7ffffffe)
 			hpts_timeout = 0x7ffffffe;
 		rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
 	}
 	if (slot) {
 		rack->r_ctl.rc_last_output_to = cts + slot;
 		if ((hpts_timeout == 0) || (hpts_timeout > slot)) {
 			if (rack->rc_inp->inp_in_hpts == 0)
 				tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot));
 			rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
 		} else {
 			/*
 			 * Arrange for the hpts to kick back in after the
 			 * t-o if the t-o does not cause a send.
 			 */
 			if (rack->rc_inp->inp_in_hpts == 0)
 				tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
 			rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
 		}
 	} else if (hpts_timeout) {
 		if (rack->rc_inp->inp_in_hpts == 0)
 			tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
 		rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
 	} else {
 		/* No timer starting */
 #ifdef INVARIANTS
 		if (SEQ_GT(tp->snd_max, tp->snd_una)) {
 			panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
 			    tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
 		}
 #endif
 	}
 	rack->rc_tmr_stopped = 0;
 	if (slot)
 		rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts);
 }
 
 /*
  * RACK Timer, here we simply do logging and house keeping.
  * the normal rack_output() function will call the
  * appropriate thing to check if we need to do a RACK retransmit.
  * We return 1, saying don't proceed with rack_output only
  * when all timers have been stopped (destroyed PCB?).
  */
 static int
 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	/*
 	 * This timer simply provides an internal trigger to send out data.
 	 * The check_recovery_mode call will see if there are needed
 	 * retransmissions, if so we will enter fast-recovery. The output
 	 * call may or may not do the same thing depending on sysctl
 	 * settings.
 	 */
 	struct rack_sendmap *rsm;
 	int32_t recovery;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
 		/* Its not time yet */
 		return (0);
 	}
 	rack_log_to_event(rack, RACK_TO_FRM_RACK);
 	recovery = IN_RECOVERY(tp->t_flags);
 	counter_u64_add(rack_to_tot, 1);
 	if (rack->r_state && (rack->r_state != tp->t_state))
 		rack_set_state(tp, rack);
 	rsm = rack_check_recovery_mode(tp, cts);
 	if (rsm) {
 		uint32_t rtt;
 
 		rtt = rack->rc_rack_rtt;
 		if (rtt == 0)
 			rtt = 1;
 		if ((recovery == 0) &&
 		    (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) {
 			/*
 			 * The rack-timeout that enter's us into recovery
 			 * will force out one MSS and set us up so that we
 			 * can do one more send in 2*rtt (transitioning the
 			 * rack timeout into a rack-tlp).
 			 */
 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 		} else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) &&
 		    ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) {
 			/*
 			 * When a rack timer goes, we have to send at 
 			 * least one segment. They will be paced a min of 1ms
 			 * apart via the next rack timer (or further
 			 * if the rack timer dictates it).
 			 */
 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 		}
 	} else {
 		/* This is a case that should happen rarely if ever */
 		counter_u64_add(rack_tlp_does_nada, 1);
 #ifdef TCP_BLACKBOX
 		tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
 #endif
 		rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	}
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
 	return (0);
 }
 
 /*
  * TLP Timer, here we simply setup what segment we want to
  * have the TLP expire on, the normal rack_output() will then
  * send it out.
  *
  * We return 1, saying don't proceed with rack_output only
  * when all timers have been stopped (destroyed PCB?).
  */
 static int
 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	/*
 	 * Tail Loss Probe.
 	 */
 	struct rack_sendmap *rsm = NULL;
 	struct socket *so;
 	uint32_t amm, old_prr_snd = 0;
 	uint32_t out, avail;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
 		/* Its not time yet */
 		return (0);
 	}
 	if (rack_progress_timeout_check(tp)) {
 		tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 		return (1);
 	}
 	/*
 	 * A TLP timer has expired. We have been idle for 2 rtts. So we now
 	 * need to figure out how to force a full MSS segment out.
 	 */
 	rack_log_to_event(rack, RACK_TO_FRM_TLP);
 	counter_u64_add(rack_tlp_tot, 1);
 	if (rack->r_state && (rack->r_state != tp->t_state))
 		rack_set_state(tp, rack);
 	so = tp->t_inpcb->inp_socket;
 	avail = sbavail(&so->so_snd);
 	out = tp->snd_max - tp->snd_una;
 	rack->rc_timer_up = 1;
 	/*
 	 * If we are in recovery we can jazz out a segment if new data is
 	 * present simply by setting rc_prr_sndcnt to a segment.
 	 */
 	if ((avail > out) &&
 	    ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) {
 		/* New data is available */
 		amm = avail - out;
 		if (amm > tp->t_maxseg) {
 			amm = tp->t_maxseg;
 		} else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
 			/* not enough to fill a MTU and no-delay is off */
 			goto need_retran;
 		}
 		if (IN_RECOVERY(tp->t_flags)) {
 			/* Unlikely */
 			old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
 			if (out + amm <= tp->snd_wnd)
 				rack->r_ctl.rc_prr_sndcnt = amm;
 			else
 				goto need_retran;
 		} else {
 			/* Set the send-new override */
 			if (out + amm <= tp->snd_wnd)
 				rack->r_ctl.rc_tlp_new_data = amm;
 			else
 				goto need_retran;
 		}
 		rack->r_ctl.rc_tlp_seg_send_cnt = 0;
 		rack->r_ctl.rc_last_tlp_seq = tp->snd_max;
 		rack->r_ctl.rc_tlpsend = NULL;
 		counter_u64_add(rack_tlp_newdata, 1);
 		goto send;
 	}
 need_retran:
 	/*
 	 * Ok we need to arrange the last un-acked segment to be re-sent, or
 	 * optionally the first un-acked segment.
 	 */
 	if (rack_always_send_oldest)
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	else {
 		rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
 		if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
 			rsm = rack_find_high_nonack(rack, rsm);
 		}
 	}
 	if (rsm == NULL) {
 		counter_u64_add(rack_tlp_does_nada, 1);
 #ifdef TCP_BLACKBOX
 		tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
 #endif
 		goto out;
 	}
 	if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) {
 		/*
 		 * We need to split this the last segment in two.
 		 */
 		int32_t idx;
 		struct rack_sendmap *nrsm;
 
 		nrsm = rack_alloc(rack);
 		if (nrsm == NULL) {
 			/*
 			 * No memory to split, we will just exit and punt
 			 * off to the RXT timer.
 			 */
 			counter_u64_add(rack_tlp_does_nada, 1);
 			goto out;
 		}
 		nrsm->r_start = (rsm->r_end - tp->t_maxseg);
 		nrsm->r_end = rsm->r_end;
 		nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
 		nrsm->r_flags = rsm->r_flags;
 		nrsm->r_sndcnt = rsm->r_sndcnt;
 		nrsm->r_rtr_bytes = 0;
 		rsm->r_end = nrsm->r_start;
 		for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
 			nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
 		}
 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
 		if (rsm->r_in_tmap) {
 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 			nrsm->r_in_tmap = 1;
 		}
 		rsm->r_flags &= (~RACK_HAS_FIN);
 		rsm = nrsm;
 	}
 	rack->r_ctl.rc_tlpsend = rsm;
 	rack->r_ctl.rc_tlp_rtx_out = 1;
 	if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) {
 		rack->r_ctl.rc_tlp_seg_send_cnt++;
 		tp->t_rxtshift++;
 	} else {
 		rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
 		rack->r_ctl.rc_tlp_seg_send_cnt = 1;
 	}
 send:
 	rack->r_ctl.rc_tlp_send_cnt++;
 	if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) {
 		/*
 		 * Can't [re]/transmit a segment we have not heard from the
 		 * peer in max times. We need the retransmit timer to take
 		 * over.
 		 */
 restore:
 		rack->r_ctl.rc_tlpsend = NULL;
 		if (rsm)
 			rsm->r_flags &= ~RACK_TLP;
 		rack->r_ctl.rc_prr_sndcnt = old_prr_snd;
 		counter_u64_add(rack_tlp_retran_fail, 1);
 		goto out;
 	} else if (rsm) {
 		rsm->r_flags |= RACK_TLP;
 	}
 	if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) &&
 	    (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
 		/*
 		 * We don't want to send a single segment more than the max
 		 * either.
 		 */
 		goto restore;
 	}
 	rack->r_timer_override = 1;
 	rack->r_tlp_running = 1;
 	rack->rc_tlp_in_progress = 1;
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
 	return (0);
 out:
 	rack->rc_timer_up = 0;
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
 	return (0);
 }
 
 /*
  * Delayed ack Timer, here we simply need to setup the
  * ACK_NOW flag and remove the DELACK flag. From there
  * the output routine will send the ack out.
  *
  * We only return 1, saying don't proceed, if all timers
  * are stopped (destroyed PCB?).
  */
 static int
 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	rack_log_to_event(rack, RACK_TO_FRM_DELACK);
 	tp->t_flags &= ~TF_DELACK;
 	tp->t_flags |= TF_ACKNOW;
 	TCPSTAT_INC(tcps_delack);
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
 	return (0);
 }
 
 /*
  * Persists timer, here we simply need to setup the
  * FORCE-DATA flag the output routine will send
  * the one byte send.
  *
  * We only return 1, saying don't proceed, if all timers
  * are stopped (destroyed PCB?).
  */
 static int
 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	struct inpcb *inp;
 	int32_t retval = 0;
 
 	inp = tp->t_inpcb;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	if (rack->rc_in_persist == 0)
 		return (0);
 	if (rack_progress_timeout_check(tp)) {
 		tcp_set_inp_to_drop(inp, ETIMEDOUT);
 		return (1);
 	}
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	/*
 	 * Persistence timer into zero window. Force a byte to be output, if
 	 * possible.
 	 */
 	TCPSTAT_INC(tcps_persisttimeo);
 	/*
 	 * Hack: if the peer is dead/unreachable, we do not time out if the
 	 * window is closed.  After a full backoff, drop the connection if
 	 * the idle time (no responses to probes) reaches the maximum
 	 * backoff that we would use if retransmitting.
 	 */
 	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 		TCPSTAT_INC(tcps_persistdrop);
 		retval = 1;
 		tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
 		goto out;
 	}
 	if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
 	    tp->snd_una == tp->snd_max)
 		rack_exit_persist(tp, rack);
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
 	/*
 	 * If the user has closed the socket then drop a persisting
 	 * connection after a much reduced timeout.
 	 */
 	if (tp->t_state > TCPS_CLOSE_WAIT &&
 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 		retval = 1;
 		TCPSTAT_INC(tcps_persistdrop);
 		tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
 		goto out;
 	}
 	tp->t_flags |= TF_FORCEDATA;
 out:
 	rack_log_to_event(rack, RACK_TO_FRM_PERSIST);
 	return (retval);
 }
 
 /*
  * If a keepalive goes off, we had no other timers
  * happening. We always return 1 here since this
  * routine either drops the connection or sends
  * out a segment with respond.
  */
 static int
 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	struct tcptemp *t_template;
 	struct inpcb *inp;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
 	inp = tp->t_inpcb;
 	rack_log_to_event(rack, RACK_TO_FRM_KEEP);
 	/*
 	 * Keep-alive timer went off; send something or drop connection if
 	 * idle for too long.
 	 */
 	TCPSTAT_INC(tcps_keeptimeo);
 	if (tp->t_state < TCPS_ESTABLISHED)
 		goto dropit;
 	if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
 	    tp->t_state <= TCPS_CLOSING) {
 		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
 			goto dropit;
 		/*
 		 * Send a packet designed to force a response if the peer is
 		 * up and reachable: either an ACK if the connection is
 		 * still alive, or an RST if the peer has closed the
 		 * connection due to timeout or reboot. Using sequence
 		 * number tp->snd_una-1 causes the transmitted zero-length
 		 * segment to lie outside the receive window; by the
 		 * protocol spec, this requires the correspondent TCP to
 		 * respond.
 		 */
 		TCPSTAT_INC(tcps_keepprobe);
 		t_template = tcpip_maketemplate(inp);
 		if (t_template) {
 			tcp_respond(tp, t_template->tt_ipgen,
 			    &t_template->tt_t, (struct mbuf *)NULL,
 			    tp->rcv_nxt, tp->snd_una - 1, 0);
 			free(t_template, M_TEMP);
 		}
 	}
 	rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
 	return (1);
 dropit:
 	TCPSTAT_INC(tcps_keepdrops);
 	tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
 	return (1);
 }
 
 /*
  * Retransmit helper function, clear up all the ack
  * flags and take care of important book keeping.
  */
 static void
 rack_remxt_tmr(struct tcpcb *tp)
 {
 	/*
 	 * The retransmit timer went off, all sack'd blocks must be
 	 * un-acked.
 	 */
 	struct rack_sendmap *rsm, *trsm = NULL;
 	struct tcp_rack *rack;
 	int32_t cnt = 0;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
 	rack_log_to_event(rack, RACK_TO_FRM_TMR);
 	if (rack->r_state && (rack->r_state != tp->t_state))
 		rack_set_state(tp, rack);
 	/*
 	 * Ideally we would like to be able to
 	 * mark SACK-PASS on anything not acked here.
 	 * However, if we do that we would burst out
 	 * all that data 1ms apart. This would be unwise,
 	 * so for now we will just let the normal rxt timer
 	 * and tlp timer take care of it.
 	 */
 	TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
 		if (rsm->r_flags & RACK_ACKED) {
 			cnt++;
 			rsm->r_sndcnt = 0;
 			if (rsm->r_in_tmap == 0) {
 				/* We must re-add it back to the tlist */
 				if (trsm == NULL) {
 					TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 				} else {
 					TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
 				}
 				rsm->r_in_tmap = 1;
 				trsm = rsm;
 			}
 		}
 		rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
 	}
 	/* Clear the count (we just un-acked them) */
 	rack->r_ctl.rc_sacked = 0;
 	/* Clear the tlp rtx mark */
 	rack->r_ctl.rc_tlp_rtx_out = 0;
 	rack->r_ctl.rc_tlp_seg_send_cnt = 0;
 	rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map);
 	/* Setup so we send one segment */
 	if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)
 		rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 	rack->r_timer_override = 1;
 }
 
 /*
  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
  * we will setup to retransmit the lowest seq number outstanding.
  */
 static int
 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	int32_t rexmt;
 	struct inpcb *inp;
 	int32_t retval = 0;
 
 	inp = tp->t_inpcb;
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	if (rack_progress_timeout_check(tp)) {
 		tcp_set_inp_to_drop(inp, ETIMEDOUT);
 		return (1);
 	}
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
 	if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 	    (tp->snd_una == tp->snd_max)) {
 		/* Nothing outstanding .. nothing to do */
 		return (0);
 	}
 	/*
 	 * Retransmission timer went off.  Message has not been acked within
 	 * retransmit interval.  Back off to a longer retransmit interval
 	 * and retransmit one segment.
 	 */
 	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		TCPSTAT_INC(tcps_timeoutdrop);
 		retval = 1;
 		tcp_set_inp_to_drop(rack->rc_inp,
 		    (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
 		goto out;
 	}
 	rack_remxt_tmr(tp);
 	if (tp->t_state == TCPS_SYN_SENT) {
 		/*
 		 * If the SYN was retransmitted, indicate CWND to be limited
 		 * to 1 segment in cc_conn_init().
 		 */
 		tp->snd_cwnd = 1;
 	} else if (tp->t_rxtshift == 1) {
 		/*
 		 * first retransmit; record ssthresh and cwnd so they can be
 		 * recovered if this turns out to be a "bad" retransmit. A
 		 * retransmit is considered "bad" if an ACK for this segment
 		 * is received within RTT/2 interval; the assumption here is
 		 * that the ACK was already in flight.  See "On Estimating
 		 * End-to-End Network Path Properties" by Allman and Paxson
 		 * for more details.
 		 */
 		tp->snd_cwnd_prev = tp->snd_cwnd;
 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
 		tp->snd_recover_prev = tp->snd_recover;
 		if (IN_FASTRECOVERY(tp->t_flags))
 			tp->t_flags |= TF_WASFRECOVERY;
 		else
 			tp->t_flags &= ~TF_WASFRECOVERY;
 		if (IN_CONGRECOVERY(tp->t_flags))
 			tp->t_flags |= TF_WASCRECOVERY;
 		else
 			tp->t_flags &= ~TF_WASCRECOVERY;
 		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
 		tp->t_flags |= TF_PREVVALID;
 	} else
 		tp->t_flags &= ~TF_PREVVALID;
 	TCPSTAT_INC(tcps_rexmttimeo);
 	if ((tp->t_state == TCPS_SYN_SENT) ||
 	    (tp->t_state == TCPS_SYN_RECEIVED))
 		rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]);
 	else
 		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
 	TCPT_RANGESET(tp->t_rxtcur, rexmt,
 	   max(MSEC_2_TICKS(rack_rto_min), rexmt),
 	   MSEC_2_TICKS(rack_rto_max));
 	/*
 	 * We enter the path for PLMTUD if connection is established or, if
 	 * connection is FIN_WAIT_1 status, reason for the last is that if
 	 * amount of data we send is very small, we could send it in couple
 	 * of packets and process straight to FIN. In that case we won't
 	 * catch ESTABLISHED state.
 	 */
 	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
 	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
 #ifdef INET6
 		int32_t isipv6;
 #endif
 
 		/*
 		 * Idea here is that at each stage of mtu probe (usually,
 		 * 1448 -> 1188 -> 524) should be given 2 chances to recover
 		 * before further clamping down. 'tp->t_rxtshift % 2 == 0'
 		 * should take care of that.
 		 */
 		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
 		    (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
 		    (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
 		    tp->t_rxtshift % 2 == 0)) {
 			/*
 			 * Enter Path MTU Black-hole Detection mechanism: -
 			 * Disable Path MTU Discovery (IP "DF" bit). -
 			 * Reduce MTU to lower value than what we negotiated
 			 * with peer.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
 				/* Record that we may have found a black hole. */
 				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
 				/* Keep track of previous MSS. */
 				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
 			}
 
 			/*
 			 * Reduce the MSS to blackhole value or to the
 			 * default in an attempt to retransmit.
 			 */
 #ifdef INET6
 			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
 			if (isipv6 &&
 			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
 				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
 			} else if (isipv6) {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_v6mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch
 				 * to minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
 			}
 #endif
 #if defined(INET6) && defined(INET)
 			else
 #endif
 #ifdef INET
 			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
 				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
 			} else {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch
 				 * to minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
 			}
 #endif
 		} else {
 			/*
 			 * If further retransmissions are still unsuccessful
 			 * with a lowered MTU, maybe this isn't a blackhole
 			 * and we restore the previous MSS and blackhole
 			 * detection flags. The limit '6' is determined by
 			 * giving each probe stage (1448, 1188, 524) 2
 			 * chances to recover.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
 			    (tp->t_rxtshift >= 6)) {
 				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
 				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
 				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
 			}
 		}
 	}
 	/*
 	 * Disable RFC1323 and SACK if we haven't got any response to our
 	 * third SYN to work-around some broken terminal servers (most of
 	 * which have hopefully been retired) that have bad VJ header
 	 * compression code which trashes TCP segments containing
 	 * unknown-to-them TCP options.
 	 */
 	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
 	    (tp->t_rxtshift == 3))
 		tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
 	/*
 	 * If we backed off this far, our srtt estimate is probably bogus.
 	 * Clobber it so we'll take the next rtt measurement as our srtt;
 	 * move the current srtt into rttvar to keep the current retransmit
 	 * times until then.
 	 */
 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
 #ifdef INET6
 		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
 			in6_losing(tp->t_inpcb);
 		else
 #endif
 			in_losing(tp->t_inpcb);
 		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
 		tp->t_srtt = 0;
 	}
 	if (rack_use_sack_filter)
 		sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
 	tp->snd_recover = tp->snd_max;
 	tp->t_flags |= TF_ACKNOW;
 	tp->t_rtttime = 0;
 	rack_cong_signal(tp, NULL, CC_RTO);
 out:
 	return (retval);
 }
 
 static int
 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling)
 {
 	int32_t ret = 0;
 	int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
 
 	if (timers == 0) {
 		return (0);
 	}
 	if (tp->t_state == TCPS_LISTEN) {
 		/* no timers on listen sockets */
 		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
 			return (0);
 		return (1);
 	}
 	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
 		uint32_t left;
 
 		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
 			ret = -1;
 			rack_log_to_processing(rack, cts, ret, 0);
 			return (0);
 		}
 		if (hpts_calling == 0) {
 			ret = -2;
 			rack_log_to_processing(rack, cts, ret, 0);
 			return (0);
 		}
 		/*
 		 * Ok our timer went off early and we are not paced false
 		 * alarm, go back to sleep.
 		 */
 		ret = -3;
 		left = rack->r_ctl.rc_timer_exp - cts;
 		tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
 		rack_log_to_processing(rack, cts, ret, left);
 		rack->rc_last_pto_set = 0;
 		return (1);
 	}
 	rack->rc_tmr_stopped = 0;
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
 	if (timers & PACE_TMR_DELACK) {
 		ret = rack_timeout_delack(tp, rack, cts);
 	} else if (timers & PACE_TMR_RACK) {
 		ret = rack_timeout_rack(tp, rack, cts);
 	} else if (timers & PACE_TMR_TLP) {
 		ret = rack_timeout_tlp(tp, rack, cts);
 	} else if (timers & PACE_TMR_RXT) {
 		ret = rack_timeout_rxt(tp, rack, cts);
 	} else if (timers & PACE_TMR_PERSIT) {
 		ret = rack_timeout_persist(tp, rack, cts);
 	} else if (timers & PACE_TMR_KEEP) {
 		ret = rack_timeout_keepalive(tp, rack, cts);
 	}
 	rack_log_to_processing(rack, cts, ret, timers);
 	return (ret);
 }
 
 static void
 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
 {
 	uint8_t hpts_removed = 0;
 
 	if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
 	    TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
 		tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
 		hpts_removed = 1;
 	}
 	if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
 		rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
 		if (rack->rc_inp->inp_in_hpts &&
 		    ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
 			/*
 			 * Canceling timer's when we have no output being
 			 * paced. We also must remove ourselves from the
 			 * hpts.
 			 */
 			tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
 			hpts_removed = 1;
 		}
 		rack_log_to_cancel(rack, hpts_removed, line);
 		rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
 	}
 }
 
 static void
 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
 {
 	return;
 }
 
 static int
 rack_stopall(struct tcpcb *tp)
 {
 	struct tcp_rack *rack;
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	rack->t_timers_stopped = 1;
 	return (0);
 }
 
 static void
 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
 {
 	return;
 }
 
 static int
 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
 {
 	return (0);
 }
 
 static void
 rack_stop_all_timers(struct tcpcb *tp)
 {
 	struct tcp_rack *rack;
 
 	/*
 	 * Assure no timers are running.
 	 */
 	if (tcp_timer_active(tp, TT_PERSIST)) {
 		/* We enter in persists, set the flag appropriately */
 		rack = (struct tcp_rack *)tp->t_fb_ptr;
 		rack->rc_in_persist = 1;
 	}
 	tcp_timer_suspend(tp, TT_PERSIST);
 	tcp_timer_suspend(tp, TT_REXMT);
 	tcp_timer_suspend(tp, TT_KEEP);
 	tcp_timer_suspend(tp, TT_DELACK);
 }
 
 static void
 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint32_t ts)
 {
 	int32_t idx;
 
 	rsm->r_rtr_cnt++;
 	rsm->r_sndcnt++;
 	if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
 		rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
 		rsm->r_flags |= RACK_OVERMAX;
 	}
 	if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) {
 		rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
 		rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
 	}
 	idx = rsm->r_rtr_cnt - 1;
 	rsm->r_tim_lastsent[idx] = ts;
 	if (rsm->r_flags & RACK_ACKED) {
 		/* Problably MTU discovery messing with us */
 		rsm->r_flags &= ~RACK_ACKED;
 		rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
 	}
 	if (rsm->r_in_tmap) {
 		TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 	}
 	TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 	rsm->r_in_tmap = 1;
 	if (rsm->r_flags & RACK_SACK_PASSED) {
 		/* We have retransmitted due to the SACK pass */
 		rsm->r_flags &= ~RACK_SACK_PASSED;
 		rsm->r_flags |= RACK_WAS_SACKPASS;
 	}
 	/* Update memory for next rtr */
 	rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
 }
 
 
 static uint32_t
 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp)
 {
 	/*
 	 * We (re-)transmitted starting at rsm->r_start for some length
 	 * (possibly less than r_end.
 	 */
 	struct rack_sendmap *nrsm;
 	uint32_t c_end;
 	int32_t len;
 	int32_t idx;
 
 	len = *lenp;
 	c_end = rsm->r_start + len;
 	if (SEQ_GEQ(c_end, rsm->r_end)) {
 		/*
 		 * We retransmitted the whole piece or more than the whole
 		 * slopping into the next rsm.
 		 */
 		rack_update_rsm(tp, rack, rsm, ts);
 		if (c_end == rsm->r_end) {
 			*lenp = 0;
 			return (0);
 		} else {
 			int32_t act_len;
 
 			/* Hangs over the end return whats left */
 			act_len = rsm->r_end - rsm->r_start;
 			*lenp = (len - act_len);
 			return (rsm->r_end);
 		}
 		/* We don't get out of this block. */
 	}
 	/*
 	 * Here we retransmitted less than the whole thing which means we
 	 * have to split this into what was transmitted and what was not.
 	 */
 	nrsm = rack_alloc(rack);
 	if (nrsm == NULL) {
 		/*
 		 * We can't get memory, so lets not proceed.
 		 */
 		*lenp = 0;
 		return (0);
 	}
 	/*
 	 * So here we are going to take the original rsm and make it what we
 	 * retransmitted. nrsm will be the tail portion we did not
 	 * retransmit. For example say the chunk was 1, 11 (10 bytes). And
 	 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
 	 * 1, 6 and the new piece will be 6, 11.
 	 */
 	nrsm->r_start = c_end;
 	nrsm->r_end = rsm->r_end;
 	nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
 	nrsm->r_flags = rsm->r_flags;
 	nrsm->r_sndcnt = rsm->r_sndcnt;
 	nrsm->r_rtr_bytes = 0;
 	rsm->r_end = c_end;
 	for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
 		nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
 	}
 	TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
 	if (rsm->r_in_tmap) {
 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 		nrsm->r_in_tmap = 1;
 	}
 	rsm->r_flags &= (~RACK_HAS_FIN);
 	rack_update_rsm(tp, rack, rsm, ts);
 	*lenp = 0;
 	return (0);
 }
 
 
 static void
 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
     uint8_t pass, struct rack_sendmap *hintrsm)
 {
 	struct tcp_rack *rack;
 	struct rack_sendmap *rsm, *nrsm;
 	register uint32_t snd_max, snd_una;
 	int32_t idx;
 
 	/*
 	 * Add to the RACK log of packets in flight or retransmitted. If
 	 * there is a TS option we will use the TS echoed, if not we will
 	 * grab a TS.
 	 *
 	 * Retransmissions will increment the count and move the ts to its
 	 * proper place. Note that if options do not include TS's then we
 	 * won't be able to effectively use the ACK for an RTT on a retran.
 	 *
 	 * Notes about r_start and r_end. Lets consider a send starting at
 	 * sequence 1 for 10 bytes. In such an example the r_start would be
 	 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
 	 * This means that r_end is actually the first sequence for the next
 	 * slot (11).
 	 *
 	 */
 	/*
 	 * If err is set what do we do XXXrrs? should we not add the thing?
 	 * -- i.e. return if err != 0 or should we pretend we sent it? --
 	 * i.e. proceed with add ** do this for now.
 	 */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (err)
 		/*
 		 * We don't log errors -- we could but snd_max does not
 		 * advance in this case either.
 		 */
 		return;
 
 	if (th_flags & TH_RST) {
 		/*
 		 * We don't log resets and we return immediately from
 		 * sending
 		 */
 		return;
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	snd_una = tp->snd_una;
 	if (SEQ_LEQ((seq_out + len), snd_una)) {
 		/* Are sending an old segment to induce an ack (keep-alive)? */
 		return;
 	}
 	if (SEQ_LT(seq_out, snd_una)) {
 		/* huh? should we panic? */
 		uint32_t end;
 
 		end = seq_out + len;
 		seq_out = snd_una;
 		len = end - seq_out;
 	}
 	snd_max = tp->snd_max;
 	if (th_flags & (TH_SYN | TH_FIN)) {
 		/*
 		 * The call to rack_log_output is made before bumping
 		 * snd_max. This means we can record one extra byte on a SYN
 		 * or FIN if seq_out is adding more on and a FIN is present
 		 * (and we are not resending).
 		 */
 		if (th_flags & TH_SYN)
 			len++;
 		if (th_flags & TH_FIN)
 			len++;
 		if (SEQ_LT(snd_max, tp->snd_nxt)) {
 			/*
 			 * The add/update as not been done for the FIN/SYN
 			 * yet.
 			 */
 			snd_max = tp->snd_nxt;
 		}
 	}
 	if (len == 0) {
 		/* We don't log zero window probes */
 		return;
 	}
 	rack->r_ctl.rc_time_last_sent = ts;
 	if (IN_RECOVERY(tp->t_flags)) {
 		rack->r_ctl.rc_prr_out += len;
 	}
 	/* First question is it a retransmission? */
 	if (seq_out == snd_max) {
 again:
 		rsm = rack_alloc(rack);
 		if (rsm == NULL) {
 			/*
 			 * Hmm out of memory and the tcb got destroyed while
 			 * we tried to wait.
 			 */
 #ifdef INVARIANTS
 			panic("Out of memory when we should not be rack:%p", rack);
 #endif
 			return;
 		}
 		if (th_flags & TH_FIN) {
 			rsm->r_flags = RACK_HAS_FIN;
 		} else {
 			rsm->r_flags = 0;
 		}
 		rsm->r_tim_lastsent[0] = ts;
 		rsm->r_rtr_cnt = 1;
 		rsm->r_rtr_bytes = 0;
 		if (th_flags & TH_SYN) {
 			/* The data space is one beyond snd_una */
 			rsm->r_start = seq_out + 1;
 			rsm->r_end = rsm->r_start + (len - 1);
 		} else {
 			/* Normal case */
 			rsm->r_start = seq_out;
 			rsm->r_end = rsm->r_start + len;
 		}
 		rsm->r_sndcnt = 0;
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 		rsm->r_in_tmap = 1;
 		return;
 	}
 	/*
 	 * If we reach here its a retransmission and we need to find it.
 	 */
 more:
 	if (hintrsm && (hintrsm->r_start == seq_out)) {
 		rsm = hintrsm;
 		hintrsm = NULL;
 	} else if (rack->r_ctl.rc_next) {
 		/* We have a hint from a previous run */
 		rsm = rack->r_ctl.rc_next;
 	} else {
 		/* No hints sorry */
 		rsm = NULL;
 	}
 	if ((rsm) && (rsm->r_start == seq_out)) {
 		/*
 		 * We used rc_next or hintrsm  to retransmit, hopefully the
 		 * likely case.
 		 */
 		seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
 		if (len == 0) {
 			return;
 		} else {
 			goto more;
 		}
 	}
 	/* Ok it was not the last pointer go through it the hard way. */
 	TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
 		if (rsm->r_start == seq_out) {
 			seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
 			rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
 			if (len == 0) {
 				return;
 			} else {
 				continue;
 			}
 		}
 		if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
 			/* Transmitted within this piece */
 			/*
 			 * Ok we must split off the front and then let the
 			 * update do the rest
 			 */
 			nrsm = rack_alloc(rack);
 			if (nrsm == NULL) {
 #ifdef INVARIANTS
 				panic("Ran out of memory that was preallocated? rack:%p", rack);
 #endif
 				rack_update_rsm(tp, rack, rsm, ts);
 				return;
 			}
 			/*
 			 * copy rsm to nrsm and then trim the front of rsm
 			 * to not include this part.
 			 */
 			nrsm->r_start = seq_out;
 			nrsm->r_end = rsm->r_end;
 			nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
 			nrsm->r_flags = rsm->r_flags;
 			nrsm->r_sndcnt = rsm->r_sndcnt;
 			nrsm->r_rtr_bytes = 0;
 			for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
 				nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
 			}
 			rsm->r_end = nrsm->r_start;
 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
 			if (rsm->r_in_tmap) {
 				TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 				nrsm->r_in_tmap = 1;
 			}
 			rsm->r_flags &= (~RACK_HAS_FIN);
 			seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
 			if (len == 0) {
 				return;
 			}
 		}
 	}
 	/*
 	 * Hmm not found in map did they retransmit both old and on into the
 	 * new?
 	 */
 	if (seq_out == tp->snd_max) {
 		goto again;
 	} else if (SEQ_LT(seq_out, tp->snd_max)) {
 #ifdef INVARIANTS
 		printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
 		    seq_out, len, tp->snd_una, tp->snd_max);
 		printf("Starting Dump of all rack entries\n");
 		TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
 			printf("rsm:%p start:%u end:%u\n",
 			    rsm, rsm->r_start, rsm->r_end);
 		}
 		printf("Dump complete\n");
 		panic("seq_out not found rack:%p tp:%p",
 		    rack, tp);
 #endif
 	} else {
 #ifdef INVARIANTS
 		/*
 		 * Hmm beyond sndmax? (only if we are using the new rtt-pack
 		 * flag)
 		 */
 		panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
 		    seq_out, len, tp->snd_max, tp);
 #endif
 	}
 }
 
 /*
  * Record one of the RTT updates from an ack into
  * our sample structure.
  */
 static void
 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt)
 {
 	if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
 	    (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
 		rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
 	}
 	if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
 	    (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
 		rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
 	}
 	rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
 	rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
 	rack->r_ctl.rack_rs.rs_rtt_cnt++;
 }
 
 /*
  * Collect new round-trip time estimate
  * and update averages and current timeout.
  */
 static void
 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
 {
 	int32_t delta;
 	uint32_t o_srtt, o_var;
 	int32_t rtt;
 
 	if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
 		/* No valid sample */
 		return;
 	if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
 		/* We are to use the lowest RTT seen in a single ack */
 		rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
 	} else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
 		/* We are to use the highest RTT seen in a single ack */
 		rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
 	} else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
 		/* We are to use the average RTT seen in a single ack */
 		rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
 				(uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
 	} else {
 #ifdef INVARIANTS
 		panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
 #endif		
 		return;
 	}
 	if (rtt == 0)
 		rtt = 1;
 	rack_log_rtt_sample(rack, rtt);
 	o_srtt = tp->t_srtt;
 	o_var = tp->t_rttvar;
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (tp->t_srtt != 0) {
 		/*
 		 * srtt is stored as fixed point with 5 bits after the
 		 * binary point (i.e., scaled by 8).  The following magic is
 		 * equivalent to the smoothing algorithm in rfc793 with an
 		 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
 		 * Adjust rtt to origin 0.
 		 */
 		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
 		    - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
 
 		tp->t_srtt += delta;
 		if (tp->t_srtt <= 0)
 			tp->t_srtt = 1;
 
 		/*
 		 * We accumulate a smoothed rtt variance (actually, a
 		 * smoothed mean difference), then set the retransmit timer
 		 * to smoothed rtt + 4 times the smoothed variance. rttvar
 		 * is stored as fixed point with 4 bits after the binary
 		 * point (scaled by 16).  The following is equivalent to
 		 * rfc793 smoothing with an alpha of .75 (rttvar =
 		 * rttvar*3/4 + |delta| / 4).  This replaces rfc793's
 		 * wired-in beta.
 		 */
 		if (delta < 0)
 			delta = -delta;
 		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 		tp->t_rttvar += delta;
 		if (tp->t_rttvar <= 0)
 			tp->t_rttvar = 1;
 		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
 			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	} else {
 		/*
 		 * No rtt measurement yet - use the unsmoothed rtt. Set the
 		 * variance to half the rtt (so our first retransmit happens
 		 * at 3*rtt).
 		 */
 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
 		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	}
 	TCPSTAT_INC(tcps_rttupdated);
 	rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var);
 	tp->t_rttupdated++;
 #ifdef NETFLIX_STATS
 	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
 #endif
 	tp->t_rxtshift = 0;
 
 	/*
 	 * the retransmit should happen at rtt + 4 * rttvar. Because of the
 	 * way we do the smoothing, srtt and rttvar will each average +1/2
 	 * tick of bias.  When we compute the retransmit timer, we want 1/2
 	 * tick of rounding and 1 extra tick because of +-1/2 tick
 	 * uncertainty in the firing of the timer.  The bias will give us
 	 * exactly the 1.5 tick we need.  But, because the bias is
 	 * statistical, we have to test that we don't drop below the minimum
 	 * feasible timer (which is 2 ticks).
 	 */
 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
 	   max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
 	tp->t_softerror = 0;
 }
 
 static void
 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
     uint32_t t, uint32_t cts)
 {
 	/*
 	 * For this RSM, we acknowledged the data from a previous
 	 * transmission, not the last one we made. This means we did a false
 	 * retransmit.
 	 */
 	struct tcp_rack *rack;
 
 	if (rsm->r_flags & RACK_HAS_FIN) {
 		/*
 		 * The sending of the FIN often is multiple sent when we
 		 * have everything outstanding ack'd. We ignore this case
 		 * since its over now.
 		 */
 		return;
 	}
 	if (rsm->r_flags & RACK_TLP) {
 		/*
 		 * We expect TLP's to have this occur.
 		 */
 		return;
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	/* should we undo cc changes and exit recovery? */
 	if (IN_RECOVERY(tp->t_flags)) {
 		if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
 			/*
 			 * Undo what we ratched down and exit recovery if
 			 * possible
 			 */
 			EXIT_RECOVERY(tp->t_flags);
 			tp->snd_recover = tp->snd_una;
 			if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
 				tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
 			if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
 				tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
 		}
 	}
 	if (rsm->r_flags & RACK_WAS_SACKPASS) {
 		/*
 		 * We retransmitted based on a sack and the earlier
 		 * retransmission ack'd it - re-ordering is occuring.
 		 */
 		counter_u64_add(rack_reorder_seen, 1);
 		rack->r_ctl.rc_reorder_ts = cts;
 	}
 	counter_u64_add(rack_badfr, 1);
 	counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
 }
 
 
 static int
 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type)
 {
 	int32_t i;
 	uint32_t t;
 
 	if (rsm->r_flags & RACK_ACKED)
 		/* Already done */
 		return (0);
 
 
 	if ((rsm->r_rtr_cnt == 1) ||
 	    ((ack_type == CUM_ACKED) &&
 	    (to->to_flags & TOF_TS) &&
 	    (to->to_tsecr) &&
 	    (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr))
 	    ) {
 		/*
 		 * We will only find a matching timestamp if its cum-acked.
 		 * But if its only one retransmission its for-sure matching
 		 * :-)
 		 */
 		t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
 		if ((int)t <= 0)
 			t = 1;
 		if (!tp->t_rttlow || tp->t_rttlow > t)
 			tp->t_rttlow = t;
 		if (!rack->r_ctl.rc_rack_min_rtt ||
 		    SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
 			rack->r_ctl.rc_rack_min_rtt = t;
 			if (rack->r_ctl.rc_rack_min_rtt == 0) {
 				rack->r_ctl.rc_rack_min_rtt = 1;
 			}
 		}
 		tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1);
 		if ((rsm->r_flags & RACK_TLP) &&
 		    (!IN_RECOVERY(tp->t_flags))) {
 			/* Segment was a TLP and our retrans matched */
 			if (rack->r_ctl.rc_tlp_cwnd_reduce) {
 				rack->r_ctl.rc_rsm_start = tp->snd_max;
 				rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
 				rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
 				rack_cong_signal(tp, NULL, CC_NDUPACK);
 				/*
 				 * When we enter recovery we need to assure
 				 * we send one packet.
 				 */
 				rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 			} else
 				rack->r_ctl.rc_tlp_rtx_out = 0;
 		}
 		if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
 			/* New more recent rack_tmit_time */
 			rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
 			rack->rc_rack_rtt = t;
 		}
 		return (1);
 	}
 	/* 
 	 * We clear the soft/rxtshift since we got an ack. 
 	 * There is no assurance we will call the commit() function
 	 * so we need to clear these to avoid incorrect handling.
 	 */
 	tp->t_rxtshift = 0;
 	tp->t_softerror = 0;
 	if ((to->to_flags & TOF_TS) &&
 	    (ack_type == CUM_ACKED) &&
 	    (to->to_tsecr) &&
 	    ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) {
 		/*
 		 * Now which timestamp does it match? In this block the ACK
 		 * must be coming from a previous transmission.
 		 */
 		for (i = 0; i < rsm->r_rtr_cnt; i++) {
 			if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
 				t = cts - rsm->r_tim_lastsent[i];
 				if ((int)t <= 0)
 					t = 1;
 				if ((i + 1) < rsm->r_rtr_cnt) {
 					/* Likely */
 					rack_earlier_retran(tp, rsm, t, cts);
 				}
 				if (!tp->t_rttlow || tp->t_rttlow > t)
 					tp->t_rttlow = t;
 				if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
 					rack->r_ctl.rc_rack_min_rtt = t;
 					if (rack->r_ctl.rc_rack_min_rtt == 0) {
 						rack->r_ctl.rc_rack_min_rtt = 1;
 					}
 				}
                                 /*
 				 * Note the following calls to
 				 * tcp_rack_xmit_timer() are being commented
 				 * out for now. They give us no more accuracy
 				 * and often lead to a wrong choice. We have
 				 * enough samples that have not been 
 				 * retransmitted. I leave the commented out
 				 * code in here in case in the future we
 				 * decide to add it back (though I can't forsee
 				 * doing that). That way we will easily see
 				 * where they need to be placed.
 				 */
 				if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
 				    rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
 					/* New more recent rack_tmit_time */
 					rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
 					rack->rc_rack_rtt = t;
 				}
 				return (1);
 			}
 		}
 		goto ts_not_found;
 	} else {
 		/*
 		 * Ok its a SACK block that we retransmitted. or a windows
 		 * machine without timestamps. We can tell nothing from the
 		 * time-stamp since its not there or the time the peer last
 		 * recieved a segment that moved forward its cum-ack point.
 		 */
 ts_not_found:
 		i = rsm->r_rtr_cnt - 1;
 		t = cts - rsm->r_tim_lastsent[i];
 		if ((int)t <= 0)
 			t = 1;
 		if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
 			/*
 			 * We retransmitted and the ack came back in less
 			 * than the smallest rtt we have observed. We most
 			 * likey did an improper retransmit as outlined in
 			 * 4.2 Step 3 point 2 in the rack-draft.
 			 */
 			i = rsm->r_rtr_cnt - 2;
 			t = cts - rsm->r_tim_lastsent[i];
 			rack_earlier_retran(tp, rsm, t, cts);
 		} else if (rack->r_ctl.rc_rack_min_rtt) {
 			/*
 			 * We retransmitted it and the retransmit did the
 			 * job.
 			 */
 			if (!rack->r_ctl.rc_rack_min_rtt ||
 			    SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
 				rack->r_ctl.rc_rack_min_rtt = t;
 				if (rack->r_ctl.rc_rack_min_rtt == 0) {
 					rack->r_ctl.rc_rack_min_rtt = 1;
 				}
 			}
 			if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
 				/* New more recent rack_tmit_time */
 				rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
 				rack->rc_rack_rtt = t;
 			}
 			return (1);
 		}
 	}
 	return (0);
 }
 
 /*
  * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
  */
 static void
 rack_log_sack_passed(struct tcpcb *tp,
     struct tcp_rack *rack, struct rack_sendmap *rsm)
 {
 	struct rack_sendmap *nrsm;
 	uint32_t ts;
 	int32_t idx;
 
 	idx = rsm->r_rtr_cnt - 1;
 	ts = rsm->r_tim_lastsent[idx];
 	nrsm = rsm;
 	TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
 	    rack_head, r_tnext) {
 		if (nrsm == rsm) {
 			/* Skip orginal segment he is acked */
 			continue;
 		}
 		if (nrsm->r_flags & RACK_ACKED) {
 			/* Skip ack'd segments */
 			continue;
 		}
 		idx = nrsm->r_rtr_cnt - 1;
 		if (ts == nrsm->r_tim_lastsent[idx]) {
 			/*
 			 * For this case lets use seq no, if we sent in a
 			 * big block (TSO) we would have a bunch of segments
 			 * sent at the same time.
 			 *
 			 * We would only get a report if its SEQ is earlier.
 			 * If we have done multiple retransmits the times
 			 * would not be equal.
 			 */
 			if (SEQ_LT(nrsm->r_start, rsm->r_start)) {
 				nrsm->r_flags |= RACK_SACK_PASSED;
 				nrsm->r_flags &= ~RACK_WAS_SACKPASS;
 			}
 		} else {
 			/*
 			 * Here they were sent at different times, not a big
 			 * block. Since we transmitted this one later and
 			 * see it sack'd then this must also be missing (or
 			 * we would have gotten a sack block for it)
 			 */
 			nrsm->r_flags |= RACK_SACK_PASSED;
 			nrsm->r_flags &= ~RACK_WAS_SACKPASS;
 		}
 	}
 }
 
 static uint32_t
 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
     struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts)
 {
 	int32_t idx;
 	int32_t times = 0;
 	uint32_t start, end, changed = 0;
 	struct rack_sendmap *rsm, *nrsm;
 	int32_t used_ref = 1;
 
 	start = sack->start;
 	end = sack->end;
 	rsm = *prsm;
 	if (rsm && SEQ_LT(start, rsm->r_start)) {
 		TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) {
 			if (SEQ_GEQ(start, rsm->r_start) &&
 			    SEQ_LT(start, rsm->r_end)) {
 				goto do_rest_ofb;
 			}
 		}
 	}
 	if (rsm == NULL) {
 start_at_beginning:
 		rsm = NULL;
 		used_ref = 0;
 	}
 	/* First lets locate the block where this guy is */
 	TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) {
 		if (SEQ_GEQ(start, rsm->r_start) &&
 		    SEQ_LT(start, rsm->r_end)) {
 			break;
 		}
 	}
 do_rest_ofb:
 	if (rsm == NULL) {
 		/*
 		 * This happens when we get duplicate sack blocks with the
 		 * same end. For example SACK 4: 100 SACK 3: 100 The sort
 		 * will not change there location so we would just start at
 		 * the end of the first one and get lost.
 		 */
 		if (tp->t_flags & TF_SENTFIN) {
 			/*
 			 * Check to see if we have not logged the FIN that
 			 * went out.
 			 */
 			nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
 			if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
 				/*
 				 * Ok we did not get the FIN logged.
 				 */
 				nrsm->r_end++;
 				rsm = nrsm;
 				goto do_rest_ofb;
 			}
 		}
 		if (times == 1) {
 #ifdef INVARIANTS
 			panic("tp:%p rack:%p sack:%p to:%p prsm:%p",
 			    tp, rack, sack, to, prsm);
 #else
 			goto out;
 #endif
 		}
 		times++;
 		counter_u64_add(rack_sack_proc_restart, 1);
 		goto start_at_beginning;
 	}
 	/* Ok we have an ACK for some piece of rsm */
 	if (rsm->r_start != start) {
 		/*
 		 * Need to split this in two pieces the before and after.
 		 */
 		nrsm = rack_alloc(rack);
 		if (nrsm == NULL) {
 			/*
 			 * failed XXXrrs what can we do but loose the sack
 			 * info?
 			 */
 			goto out;
 		}
 		nrsm->r_start = start;
 		nrsm->r_rtr_bytes = 0;
 		nrsm->r_end = rsm->r_end;
 		nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
 		nrsm->r_flags = rsm->r_flags;
 		nrsm->r_sndcnt = rsm->r_sndcnt;
 		for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
 			nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
 		}
 		rsm->r_end = nrsm->r_start;
 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
 		if (rsm->r_in_tmap) {
 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 			nrsm->r_in_tmap = 1;
 		}
 		rsm->r_flags &= (~RACK_HAS_FIN);
 		rsm = nrsm;
 	}
 	if (SEQ_GEQ(end, rsm->r_end)) {
 		/*
 		 * The end of this block is either beyond this guy or right
 		 * at this guy.
 		 */
 
 		if ((rsm->r_flags & RACK_ACKED) == 0) {
 			rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
 			changed += (rsm->r_end - rsm->r_start);
 			rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
 			rack_log_sack_passed(tp, rack, rsm);
 			/* Is Reordering occuring? */
 			if (rsm->r_flags & RACK_SACK_PASSED) {
 				counter_u64_add(rack_reorder_seen, 1);
 				rack->r_ctl.rc_reorder_ts = cts;
 			}
 			rsm->r_flags |= RACK_ACKED;
 			rsm->r_flags &= ~RACK_TLP;
 			if (rsm->r_in_tmap) {
 				TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 				rsm->r_in_tmap = 0;
 			}
 		}
 		if (end == rsm->r_end) {
 			/* This block only - done */
 			goto out;
 		}
 		/* There is more not coverend by this rsm move on */
 		start = rsm->r_end;
 		nrsm = TAILQ_NEXT(rsm, r_next);
 		rsm = nrsm;
 		times = 0;
 		goto do_rest_ofb;
 	}
 	/* Ok we need to split off this one at the tail */
 	nrsm = rack_alloc(rack);
 	if (nrsm == NULL) {
 		/* failed rrs what can we do but loose the sack info? */
 		goto out;
 	}
 	/* Clone it */
 	nrsm->r_start = end;
 	nrsm->r_end = rsm->r_end;
 	nrsm->r_rtr_bytes = 0;
 	nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
 	nrsm->r_flags = rsm->r_flags;
 	nrsm->r_sndcnt = rsm->r_sndcnt;
 	for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
 		nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
 	}
 	/* The sack block does not cover this guy fully */
 	rsm->r_flags &= (~RACK_HAS_FIN);
 	rsm->r_end = end;
 	TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
 	if (rsm->r_in_tmap) {
 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 		nrsm->r_in_tmap = 1;
 	}
 	if (rsm->r_flags & RACK_ACKED) {
 		/* Been here done that */
 		goto out;
 	}
 	rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
 	changed += (rsm->r_end - rsm->r_start);
 	rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
 	rack_log_sack_passed(tp, rack, rsm);
 	/* Is Reordering occuring? */
 	if (rsm->r_flags & RACK_SACK_PASSED) {
 		counter_u64_add(rack_reorder_seen, 1);
 		rack->r_ctl.rc_reorder_ts = cts;
 	}
 	rsm->r_flags |= RACK_ACKED;
 	rsm->r_flags &= ~RACK_TLP;
 	if (rsm->r_in_tmap) {
 		TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 		rsm->r_in_tmap = 0;
 	}
 out:
 	if (used_ref == 0) {
 		counter_u64_add(rack_sack_proc_all, 1);
 	} else {
 		counter_u64_add(rack_sack_proc_short, 1);
 	}
 	/* Save off where we last were */
 	if (rsm)
 		rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
 	else
 		rack->r_ctl.rc_sacklast = NULL;
 	*prsm = rsm;
 	return (changed);
 }
 
 static void inline 
 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
 {
 	struct rack_sendmap *tmap;
 
 	tmap = NULL;
 	while (rsm && (rsm->r_flags & RACK_ACKED)) {
 		/* Its no longer sacked, mark it so */
 		rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
 #ifdef INVARIANTS
 		if (rsm->r_in_tmap) {
 			panic("rack:%p rsm:%p flags:0x%x in tmap?",
 			      rack, rsm, rsm->r_flags);
 		}
 #endif
 		rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
 		/* Rebuild it into our tmap */
 		if (tmap == NULL) {
 			TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 			tmap = rsm;
 		} else {
 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
 			tmap = rsm;
 		}
 		tmap->r_in_tmap = 1;
 		rsm = TAILQ_NEXT(rsm, r_next);
 	}
 	/* 
 	 * Now lets possibly clear the sack filter so we start 
 	 * recognizing sacks that cover this area.
 	 */
 	if (rack_use_sack_filter)
 		sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
 
 }
 
 static void
 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
 {
 	uint32_t changed, last_seq, entered_recovery = 0;
 	struct tcp_rack *rack;
 	struct rack_sendmap *rsm;
 	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
 	register uint32_t th_ack;
 	int32_t i, j, k, num_sack_blks = 0;
 	uint32_t cts, acked, ack_point, sack_changed = 0;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (th->th_flags & TH_RST) {
 		/* We don't log resets */
 		return;
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	cts = tcp_ts_getticks();
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
 	changed = 0;
 	th_ack = th->th_ack;
 
 	if (SEQ_GT(th_ack, tp->snd_una)) {
 		rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
 		tp->t_acktime = ticks;
 	}
 	if (rsm && SEQ_GT(th_ack, rsm->r_start))
 		changed = th_ack - rsm->r_start;
 	if (changed) {
 		/*
 		 * The ACK point is advancing to th_ack, we must drop off
 		 * the packets in the rack log and calculate any eligble
 		 * RTT's.
 		 */
 		rack->r_wanted_output++;
 more:
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
 		if (rsm == NULL) {
 			if ((th_ack - 1) == tp->iss) {
 				/*
 				 * For the SYN incoming case we will not
 				 * have called tcp_output for the sending of
 				 * the SYN, so there will be no map. All
 				 * other cases should probably be a panic.
 				 */
 				goto proc_sack;
 			}
 			if (tp->t_flags & TF_SENTFIN) {
 				/* if we send a FIN we will not hav a map */
 				goto proc_sack;
 			}
 #ifdef INVARIANTS
 			panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
 			    tp,
 			    th, tp->t_state, rack,
 			    tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
 #endif
 			goto proc_sack;
 		}
 		if (SEQ_LT(th_ack, rsm->r_start)) {
 			/* Huh map is missing this */
 #ifdef INVARIANTS
 			printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
 			    rsm->r_start,
 			    th_ack, tp->t_state, rack->r_state);
 #endif
 			goto proc_sack;
 		}
 		rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED);
 		/* Now do we consume the whole thing? */
 		if (SEQ_GEQ(th_ack, rsm->r_end)) {
 			/* Its all consumed. */
 			uint32_t left;
 
 			rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
 			rsm->r_rtr_bytes = 0;
 			TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
 			if (rsm->r_in_tmap) {
 				TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 				rsm->r_in_tmap = 0;
 			}
 			if (rack->r_ctl.rc_next == rsm) {
 				/* scoot along the marker */
 				rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map);
 			}
 			if (rsm->r_flags & RACK_ACKED) {
 				/*
 				 * It was acked on the scoreboard -- remove
 				 * it from total
 				 */
 				rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
 			} else if (rsm->r_flags & RACK_SACK_PASSED) {
 				/*
 				 * There are acked segments ACKED on the
 				 * scoreboard further up. We are seeing
 				 * reordering.
 				 */
 				counter_u64_add(rack_reorder_seen, 1);
 				rsm->r_flags |= RACK_ACKED;
 				rack->r_ctl.rc_reorder_ts = cts;
 			}
 			left = th_ack - rsm->r_end;
 			if (rsm->r_rtr_cnt > 1) {
 				/*
 				 * Technically we should make r_rtr_cnt be
 				 * monotonicly increasing and just mod it to
 				 * the timestamp it is replacing.. that way
 				 * we would have the last 3 retransmits. Now
 				 * rc_loss_count will be wrong if we
 				 * retransmit something more than 2 times in
 				 * recovery :(
 				 */
 				rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1);
 			}
 			/* Free back to zone */
 			rack_free(rack, rsm);
 			if (left) {
 				goto more;
 			}
 			goto proc_sack;
 		}
 		if (rsm->r_flags & RACK_ACKED) {
 			/*
 			 * It was acked on the scoreboard -- remove it from
 			 * total for the part being cum-acked.
 			 */
 			rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
 		}
 		rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
 		rsm->r_rtr_bytes = 0;
 		rsm->r_start = th_ack;
 	}
 proc_sack:
 	/* Check for reneging */
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
 	if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
 		/*
 		 * The peer has moved snd_una up to
 		 * the edge of this send, i.e. one
 		 * that it had previously acked. The only
 		 * way that can be true if the peer threw
 		 * away data (space issues) that it had
 		 * previously sacked (else it would have 
 		 * given us snd_una up to (rsm->r_end).
 		 * We need to undo the acked markings here.
 		 *
 		 * Note we have to look to make sure th_ack is
 		 * our rsm->r_start in case we get an old ack
 		 * where th_ack is behind snd_una.
 		 */
 		rack_peer_reneges(rack, rsm, th->th_ack);
 	}
 	if ((to->to_flags & TOF_SACK) == 0) {
 		/* We are done nothing left to log */
 		goto out;
 	}
 	rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
 	if (rsm) {
 		last_seq = rsm->r_end;
 	} else {
 		last_seq = tp->snd_max;
 	}
 	/* Sack block processing */
 	if (SEQ_GT(th_ack, tp->snd_una))
 		ack_point = th_ack;
 	else
 		ack_point = tp->snd_una;
 	for (i = 0; i < to->to_nsacks; i++) {
 		bcopy((to->to_sacks + i * TCPOLEN_SACK),
 		    &sack, sizeof(sack));
 		sack.start = ntohl(sack.start);
 		sack.end = ntohl(sack.end);
 		if (SEQ_GT(sack.end, sack.start) &&
 		    SEQ_GT(sack.start, ack_point) &&
 		    SEQ_LT(sack.start, tp->snd_max) &&
 		    SEQ_GT(sack.end, ack_point) &&
 		    SEQ_LEQ(sack.end, tp->snd_max)) {
 			if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) &&
 			    (SEQ_LT(sack.end, last_seq)) &&
 			    ((sack.end - sack.start) < (tp->t_maxseg / 8))) {
 				/*
 				 * Not the last piece and its smaller than
 				 * 1/8th of a MSS. We ignore this.
 				 */
 				counter_u64_add(rack_runt_sacks, 1);
 				continue;
 			}
 			sack_blocks[num_sack_blks] = sack;
 			num_sack_blks++;
 #ifdef NETFLIX_STATS
 		} else if (SEQ_LEQ(sack.start, th_ack) &&
 			   SEQ_LEQ(sack.end, th_ack)) {
 			/*
 			 * Its a D-SACK block.
 			 */
 			tcp_record_dsack(sack.start, sack.end);
 #endif
 		}
 
 	}
 	if (num_sack_blks == 0)
 		goto out;
 	/*
 	 * Sort the SACK blocks so we can update the rack scoreboard with
 	 * just one pass.
 	 */
 	if (rack_use_sack_filter) {
 		num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack);
 	}
 	if (num_sack_blks < 2) {
 		goto do_sack_work;
 	}
 	/* Sort the sacks */
 	for (i = 0; i < num_sack_blks; i++) {
 		for (j = i + 1; j < num_sack_blks; j++) {
 			if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
 				sack = sack_blocks[i];
 				sack_blocks[i] = sack_blocks[j];
 				sack_blocks[j] = sack;
 			}
 		}
 	}
 	/*
 	 * Now are any of the sack block ends the same (yes some
 	 * implememtations send these)?
 	 */
 again:
 	if (num_sack_blks > 1) {
 		for (i = 0; i < num_sack_blks; i++) {
 			for (j = i + 1; j < num_sack_blks; j++) {
 				if (sack_blocks[i].end == sack_blocks[j].end) {
 					/*
 					 * Ok these two have the same end we
 					 * want the smallest end and then
 					 * throw away the larger and start
 					 * again.
 					 */
 					if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
 						/*
 						 * The second block covers
 						 * more area use that
 						 */
 						sack_blocks[i].start = sack_blocks[j].start;
 					}
 					/*
 					 * Now collapse out the dup-sack and
 					 * lower the count
 					 */
 					for (k = (j + 1); k < num_sack_blks; k++) {
 						sack_blocks[j].start = sack_blocks[k].start;
 						sack_blocks[j].end = sack_blocks[k].end;
 						j++;
 					}
 					num_sack_blks--;
 					goto again;
 				}
 			}
 		}
 	}
 do_sack_work:
 	rsm = rack->r_ctl.rc_sacklast;
 	for (i = 0; i < num_sack_blks; i++) {
 		acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts);
 		if (acked) {
 			rack->r_wanted_output++;
 			changed += acked;
 			sack_changed += acked;
 		}
 	}
 out:
 	if (changed) {
 		/* Something changed cancel the rack timer */
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 	}
 	if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
 		/*
 		 * Ok we have a high probability that we need to go in to
 		 * recovery since we have data sack'd
 		 */
 		struct rack_sendmap *rsm;
 		uint32_t tsused;
 
 		tsused = tcp_ts_getticks();
 		rsm = tcp_rack_output(tp, rack, tsused);
 		if (rsm) {
 			/* Enter recovery */
 			rack->r_ctl.rc_rsm_start = rsm->r_start;
 			rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
 			rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
 			entered_recovery = 1;
 			rack_cong_signal(tp, NULL, CC_NDUPACK);
 			/*
 			 * When we enter recovery we need to assure we send
 			 * one packet.
 			 */
 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 			rack->r_timer_override = 1;
 		}
 	}
 	if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
 		/* Deal with changed an PRR here (in recovery only) */
 		uint32_t pipe, snd_una;
 
 		rack->r_ctl.rc_prr_delivered += changed;
 		/* Compute prr_sndcnt */
 		if (SEQ_GT(tp->snd_una, th_ack)) {
 			snd_una = tp->snd_una;
 		} else {
 			snd_una = th_ack;
 		}
 		pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
 		if (pipe > tp->snd_ssthresh) {
 			long sndcnt;
 
 			sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
 			if (rack->r_ctl.rc_prr_recovery_fs > 0)
 				sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
 			else {
 				rack->r_ctl.rc_prr_sndcnt = 0;
 				sndcnt = 0;
 			}
 			sndcnt++;
 			if (sndcnt > (long)rack->r_ctl.rc_prr_out)
 				sndcnt -= rack->r_ctl.rc_prr_out;
 			else
 				sndcnt = 0;
 			rack->r_ctl.rc_prr_sndcnt = sndcnt;
 		} else {
 			uint32_t limit;
 
 			if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
 				limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
 			else
 				limit = 0;
 			if (changed > limit)
 				limit = changed;
 			limit += tp->t_maxseg;
 			if (tp->snd_ssthresh > pipe) {
 				rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
 			} else {
 				rack->r_ctl.rc_prr_sndcnt = min(0, limit);
 			}
 		}
 		if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) {
 			rack->r_timer_override = 1;
 		}
 	}
 }
 
 /*
  * Return value of 1, we do not need to call rack_process_data().
  * return value of 0, rack_process_data can be called.
  * For ret_val if its 0 the TCP is locked, if its non-zero
  * its unlocked and probably unsafe to touch the TCB.
  */
 static int
 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to,
     int32_t * ti_locked, uint32_t tiwin, int32_t tlen,
     int32_t * ofia, int32_t thflags, int32_t * ret_val)
 {
 	int32_t ourfinisacked = 0;
 	int32_t nsegs, acked_amount;
 	int32_t acked;
 	struct mbuf *mfree;
 	struct tcp_rack *rack;
 	int32_t recovery = 0;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (SEQ_GT(th->th_ack, tp->snd_max)) {
 		rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val);
 		return (1);
 	}
 	if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
 		rack_log_ack(tp, to, th);
 	}
 	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
 		/*
 		 * Old ack, behind (or duplicate to) the last one rcv'd
 		 * Note: Should mark reordering is occuring! We should also
 		 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1,
 		 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no
 		 * retran and> ack 3
 		 */
 		return (0);
 	}
 	/*
 	 * If we reach this point, ACK is not a duplicate, i.e., it ACKs
 	 * something we sent.
 	 */
 	if (tp->t_flags & TF_NEEDSYN) {
 		/*
 		 * T/TCP: Connection was half-synchronized, and our SYN has
 		 * been ACK'd (so connection is now fully synchronized).  Go
 		 * to non-starred state, increment snd_una for ACK of SYN,
 		 * and check if we can do window scaling.
 		 */
 		tp->t_flags &= ~TF_NEEDSYN;
 		tp->snd_una++;
 		/* Do window scaling? */
 		if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 		    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 			/* Send window already scaled. */
 		}
 	}
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	acked = BYTES_THIS_ACK(tp, th);
 	TCPSTAT_ADD(tcps_rcvackpack, nsegs);
 	TCPSTAT_ADD(tcps_rcvackbyte, acked);
 
 	/*
 	 * If we just performed our first retransmit, and the ACK arrives
 	 * within our recovery window, then it was a mistake to do the
 	 * retransmit in the first place.  Recover our original cwnd and
 	 * ssthresh, and proceed to transmit where we left off.
 	 */
 	if (tp->t_flags & TF_PREVVALID) {
 		tp->t_flags &= ~TF_PREVVALID;
 		if (tp->t_rxtshift == 1 &&
 		    (int)(ticks - tp->t_badrxtwin) < 0)
 			rack_cong_signal(tp, th, CC_RTO_ERR);
 	}
 	/*
 	 * If we have a timestamp reply, update smoothed round trip time. If
 	 * no timestamp is present but transmit timer is running and timed
 	 * sequence number was acked, update smoothed round trip time. Since
 	 * we now have an rtt measurement, cancel the timer backoff (cf.,
 	 * Phil Karn's retransmit alg.). Recompute the initial retransmit
 	 * timer.
 	 *
 	 * Some boxes send broken timestamp replies during the SYN+ACK
 	 * phase, ignore timestamps of 0 or we could calculate a huge RTT
 	 * and blow up the retransmit timer.
 	 */
 	/*
 	 * If all outstanding data is acked, stop retransmit timer and
 	 * remember to restart (more output or persist). If there is more
 	 * data to be acked, restart retransmit timer, using current
 	 * (possibly backed-off) value.
 	 */
 	if (th->th_ack == tp->snd_max) {
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 		rack->r_wanted_output++;
 	}
 	/*
 	 * If no data (only SYN) was ACK'd, skip rest of ACK processing.
 	 */
 	if (acked == 0) {
 		if (ofia)
 			*ofia = ourfinisacked;
 		return (0);
 	}
 	if (rack->r_ctl.rc_early_recovery) {
 		if (IN_FASTRECOVERY(tp->t_flags)) {
 			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 				tcp_rack_partialack(tp, th);
 			} else {
 				rack_post_recovery(tp, th);
 				recovery = 1;
 			}
 		}
 	}
 	/*
 	 * Let the congestion control algorithm update congestion control
 	 * related information. This typically means increasing the
 	 * congestion window.
 	 */
 	rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
 	SOCKBUF_LOCK(&so->so_snd);
 	acked_amount = min(acked, (int)sbavail(&so->so_snd));
 	tp->snd_wnd -= acked_amount;
 	mfree = sbcut_locked(&so->so_snd, acked_amount);
 	if ((sbused(&so->so_snd) == 0) &&
 	    (acked > acked_amount) &&
 	    (tp->t_state >= TCPS_FIN_WAIT_1)) {
 		ourfinisacked = 1;
 	}
 	/* NB: sowwakeup_locked() does an implicit unlock. */
 	sowwakeup_locked(so);
 	m_freem(mfree);
 	if (rack->r_ctl.rc_early_recovery == 0) {
 		if (IN_FASTRECOVERY(tp->t_flags)) {
 			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 				tcp_rack_partialack(tp, th);
 			} else {
 				rack_post_recovery(tp, th);
 			}
 		}
 	}
 	tp->snd_una = th->th_ack;
 	if (SEQ_GT(tp->snd_una, tp->snd_recover))
 		tp->snd_recover = tp->snd_una;
 
 	if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
 		tp->snd_nxt = tp->snd_una;
 	}
 	if (tp->snd_una == tp->snd_max) {
 		/* Nothing left outstanding */
 		rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
 		tp->t_acktime = 0;
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 		/* Set need output so persist might get set */
 		rack->r_wanted_output++;
 		if (rack_use_sack_filter)
 			sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
 		if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
 		    (sbavail(&so->so_snd) == 0) &&
 		    (tp->t_flags2 & TF2_DROP_AF_DATA)) {
 			/* 
 			 * The socket was gone and the
 			 * peer sent data, time to
 			 * reset him.
 			 */
 			*ret_val = 1;
 			tp = tcp_close(tp);
 			rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, tlen);
 			return (1);
 		}
 	}
 	if (ofia)
 		*ofia = ourfinisacked;
 	return (0);
 }
 
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	/*
 	 * Update window information. Don't look at window if no ACK: TAC's
 	 * send garbage on first SYN.
 	 */
 	int32_t nsegs;
 	int32_t tfo_syn;
 	struct tcp_rack *rack;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
 	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 		/* keep track of pure window updates */
 		if (tlen == 0 &&
 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 			TCPSTAT_INC(tcps_rcvwinupd);
 		tp->snd_wnd = tiwin;
 		tp->snd_wl1 = th->th_seq;
 		tp->snd_wl2 = th->th_ack;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 		rack->r_wanted_output++;
 	} else if (thflags & TH_ACK) {
 		if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
 			tp->snd_wnd = tiwin;
 			tp->snd_wl1 = th->th_seq;
 			tp->snd_wl2 = th->th_ack;
 		}
 	}
 	/* Was persist timer active and now we have window space? */
 	if ((rack->rc_in_persist != 0) && tp->snd_wnd) {
 		rack_exit_persist(tp, rack);
 		tp->snd_nxt = tp->snd_max;
 		/* Make sure we output to start the timer */
 		rack->r_wanted_output++;
 	}
 	/*
 	 * Process segments with URG.
 	 */
 	if ((thflags & TH_URG) && th->th_urp &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		/*
 		 * This is a kludge, but if we receive and accept random
 		 * urgent pointers, we'll crash in soreceive.  It's hard to
 		 * imagine someone actually wanting to send this much urgent
 		 * data.
 		 */
 		SOCKBUF_LOCK(&so->so_rcv);
 		if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
 			th->th_urp = 0;	/* XXX */
 			thflags &= ~TH_URG;	/* XXX */
 			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
 			goto dodata;	/* XXX */
 		}
 		/*
 		 * If this segment advances the known urgent pointer, then
 		 * mark the data stream.  This should not happen in
 		 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
 		 * FIN has been received from the remote side. In these
 		 * states we ignore the URG.
 		 *
 		 * According to RFC961 (Assigned Protocols), the urgent
 		 * pointer points to the last octet of urgent data.  We
 		 * continue, however, to consider it to indicate the first
 		 * octet of data past the urgent section as the original
 		 * spec states (in one of two places).
 		 */
 		if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
 			tp->rcv_up = th->th_seq + th->th_urp;
 			so->so_oobmark = sbavail(&so->so_rcv) +
 			    (tp->rcv_up - tp->rcv_nxt) - 1;
 			if (so->so_oobmark == 0)
 				so->so_rcv.sb_state |= SBS_RCVATMARK;
 			sohasoutofband(so);
 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 		}
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		/*
 		 * Remove out of band data so doesn't get presented to user.
 		 * This can happen independent of advancing the URG pointer,
 		 * but if two URG's are pending at once, some out-of-band
 		 * data may creep in... ick.
 		 */
 		if (th->th_urp <= (uint32_t) tlen &&
 		    !(so->so_options & SO_OOBINLINE)) {
 			/* hdr drop is delayed */
 			tcp_pulloutofband(so, th, m, drop_hdrlen);
 		}
 	} else {
 		/*
 		 * If no out of band data is expected, pull receive urgent
 		 * pointer along with the receive window.
 		 */
 		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
 			tp->rcv_up = tp->rcv_nxt;
 	}
 dodata:				/* XXX */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Process the segment text, merging it into the TCP sequencing
 	 * queue, and arranging for acknowledgment of receipt if necessary.
 	 * This process logically involves adjusting tp->rcv_wnd as data is
 	 * presented to the user (this happens in tcp_usrreq.c, case
 	 * PRU_RCVD).  If a FIN has already been received on this connection
 	 * then we just ignore the text.
 	 */
 	tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
 		   IS_FASTOPEN(tp->t_flags));
 	if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		tcp_seq save_start = th->th_seq;
 
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		/*
 		 * Insert segment which includes th into TCP reassembly
 		 * queue with control block tp.  Set thflags to whether
 		 * reassembly now includes a segment with FIN.  This handles
 		 * the common case inline (segment is the next to be
 		 * received on an established connection, and the queue is
 		 * empty), avoiding linkage into and removal from the queue
 		 * and repetition of various conversions. Set DELACK for
 		 * segments received in order, but ack immediately when
 		 * segments are out of order (so fast retransmit can work).
 		 */
 		if (th->th_seq == tp->rcv_nxt &&
 		    LIST_EMPTY(&tp->t_segq) &&
 		    (TCPS_HAVEESTABLISHED(tp->t_state) ||
 		    tfo_syn)) {
 			if (DELAY_ACK(tp, tlen) || tfo_syn) {
 				rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 				tp->t_flags |= TF_DELACK;
 			} else {
 				rack->r_wanted_output++;
 				tp->t_flags |= TF_ACKNOW;
 			}
 			tp->rcv_nxt += tlen;
 			thflags = th->th_flags & TH_FIN;
 			TCPSTAT_ADD(tcps_rcvpack, nsegs);
 			TCPSTAT_ADD(tcps_rcvbyte, tlen);
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 				m_freem(m);
 			else
 				sbappendstream_locked(&so->so_rcv, m, 0);
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 		} else {
 			/*
 			 * XXX: Due to the header drop above "th" is
 			 * theoretically invalid by now.  Fortunately
 			 * m_adj() doesn't actually frees any mbufs when
 			 * trimming from the head.
 			 */
 			thflags = tcp_reass(tp, th, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 		}
 		if (tlen > 0)
 			tcp_update_sack_list(tp, save_start, save_start + tlen);
 	} else {
 		m_freem(m);
 		thflags &= ~TH_FIN;
 	}
 
 	/*
 	 * If FIN is received ACK the FIN and let the user know that the
 	 * connection is closing.
 	 */
 	if (thflags & TH_FIN) {
 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 			socantrcvmore(so);
 			/*
 			 * If connection is half-synchronized (ie NEEDSYN
 			 * flag on) then delay ACK, so it may be piggybacked
 			 * when SYN is sent. Otherwise, since we received a
 			 * FIN then no more input can be expected, send ACK
 			 * now.
 			 */
 			if (tp->t_flags & TF_NEEDSYN) {
 				rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 				tp->t_flags |= TF_DELACK;
 			} else {
 				tp->t_flags |= TF_ACKNOW;
 			}
 			tp->rcv_nxt++;
 		}
 		switch (tp->t_state) {
 
 			/*
 			 * In SYN_RECEIVED and ESTABLISHED STATES enter the
 			 * CLOSE_WAIT state.
 			 */
 		case TCPS_SYN_RECEIVED:
 			tp->t_starttime = ticks;
 			/* FALLTHROUGH */
 		case TCPS_ESTABLISHED:
 			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 			tcp_state_change(tp, TCPS_CLOSE_WAIT);
 			break;
 
 			/*
 			 * If still in FIN_WAIT_1 STATE FIN has not been
 			 * acked so enter the CLOSING state.
 			 */
 		case TCPS_FIN_WAIT_1:
 			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 			tcp_state_change(tp, TCPS_CLOSING);
 			break;
 
 			/*
 			 * In FIN_WAIT_2 state enter the TIME_WAIT state,
 			 * starting the time-wait timer, turning off the
 			 * other standard timers.
 			 */
 		case TCPS_FIN_WAIT_2:
 			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 			KASSERT(*ti_locked == TI_RLOCKED, ("%s: dodata "
 			    "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
 			    *ti_locked));
 			tcp_twstart(tp);
 			*ti_locked = TI_UNLOCKED;
 			INP_INFO_RUNLOCK(&V_tcbinfo);
 			return (1);
 		}
 	}
 	if (*ti_locked == TI_RLOCKED) {
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		*ti_locked = TI_UNLOCKED;
 	}
 	/*
 	 * Return any desired output.
 	 */
 	if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
 		rack->r_wanted_output++;
 	}
 	KASSERT(*ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
 	    __func__, *ti_locked));
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	return (0);
 }
 
 /*
  * Here nothing is really faster, its just that we
  * have broken out the fast-data path also just like
  * the fast-ack.
  */
 static int
 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt)
 {
 	int32_t nsegs;
 	int32_t newsize = 0;	/* automatic sockbuf scaling */
 	struct tcp_rack *rack;
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 
 #endif
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * the timestamp. NOTE that the test is modified according to the
 	 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 */
 	if (__predict_false(th->th_seq != tp->rcv_nxt)) {
 		return (0);
 	}
 	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
 		return (0);
 	}
 	if (tiwin && tiwin != tp->snd_wnd) {
 		return (0);
 	}
 	if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
 		return (0);
 	}
 	if (__predict_false((to->to_flags & TOF_TS) &&
 	    (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
 		return (0);
 	}
 	if (__predict_false((th->th_ack != tp->snd_una))) {
 		return (0);
 	}
 	if (__predict_false(tlen > sbspace(&so->so_rcv))) {
 		return (0);
 	}
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	/*
 	 * This is a pure, in-sequence data packet with nothing on the
 	 * reassembly queue and we have enough buffer space to take it.
 	 */
 	if (*ti_locked == TI_RLOCKED) {
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		*ti_locked = TI_UNLOCKED;
 	}
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 
 
 	/* Clean receiver SACK report if present */
 	if (tp->rcv_numsacks)
 		tcp_clean_sackreport(tp);
 	TCPSTAT_INC(tcps_preddat);
 	tp->rcv_nxt += tlen;
 	/*
 	 * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
 	 */
 	tp->snd_wl1 = th->th_seq;
 	/*
 	 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
 	 */
 	tp->rcv_up = tp->rcv_nxt;
 	TCPSTAT_ADD(tcps_rcvpack, nsegs);
 	TCPSTAT_ADD(tcps_rcvbyte, tlen);
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp,
 		    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 	newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 
 	/* Add data to socket buffer. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 		m_freem(m);
 	} else {
 		/*
 		 * Set new socket buffer size. Give up when limit is
 		 * reached.
 		 */
 		if (newsize)
 			if (!sbreserve_locked(&so->so_rcv,
 			    newsize, so, NULL))
 				so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		sbappendstream_locked(&so->so_rcv, m, 0);
 		rack_calc_rwin(so, tp);
 	}
 	/* NB: sorwakeup_locked() does an implicit unlock. */
 	sorwakeup_locked(so);
 	if (DELAY_ACK(tp, tlen)) {
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 		tp->t_flags |= TF_DELACK;
 	} else {
 		tp->t_flags |= TF_ACKNOW;
 		rack->r_wanted_output++;
 	}
 	if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter)
 		sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
 	return (1);
 }
 
 /*
  * This subfunction is used to try to highly optimize the
  * fast path. We again allow window updates that are
  * in sequence to remain in the fast-path. We also add
  * in the __predict's to attempt to help the compiler.
  * Note that if we return a 0, then we can *not* process
  * it and the caller should push the packet into the
  * slow-path.
  */
 static int
 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
 {
 	int32_t acked;
 	int32_t nsegs;
 
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 
 #endif
 	struct tcp_rack *rack;
 
 	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
 		/* Old ack, behind (or duplicate to) the last one rcv'd */
 		return (0);
 	}
 	if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
 		/* Above what we have sent? */
 		return (0);
 	}
 	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
 		/* We are retransmitting */
 		return (0);
 	}
 	if (__predict_false(tiwin == 0)) {
 		/* zero window */
 		return (0);
 	}
 	if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
 		/* We need a SYN or a FIN, unlikely.. */
 		return (0);
 	}
 	if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
 		/* Timestamp is behind .. old ack with seq wrap? */
 		return (0);
 	}
 	if (__predict_false(IN_RECOVERY(tp->t_flags))) {
 		/* Still recovering */
 		return (0);
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack->r_ctl.rc_sacked) {
 		/* We have sack holes on our scoreboard */
 		return (0);
 	}
 	/* Ok if we reach here, we can process a fast-ack */
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	rack_log_ack(tp, to, th);
 	/* Did the window get updated? */
 	if (tiwin != tp->snd_wnd) {
 		tp->snd_wnd = tiwin;
 		tp->snd_wl1 = th->th_seq;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 	}
 	if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) {
 		rack_exit_persist(tp, rack);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * the timestamp. NOTE that the test is modified according to the
 	 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * This is a pure ack for outstanding data.
 	 */
 	if (*ti_locked == TI_RLOCKED) {
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		*ti_locked = TI_UNLOCKED;
 	}
 	TCPSTAT_INC(tcps_predack);
 
 	/*
 	 * "bad retransmit" recovery.
 	 */
 	if (tp->t_flags & TF_PREVVALID) {
 		tp->t_flags &= ~TF_PREVVALID;
 		if (tp->t_rxtshift == 1 &&
 		    (int)(ticks - tp->t_badrxtwin) < 0)
 			rack_cong_signal(tp, th, CC_RTO_ERR);
 	}
 	/*
 	 * Recalculate the transmit timer / rtt.
 	 *
 	 * Some boxes send broken timestamp replies during the SYN+ACK
 	 * phase, ignore timestamps of 0 or we could calculate a huge RTT
 	 * and blow up the retransmit timer.
 	 */
 	acked = BYTES_THIS_ACK(tp, th);
 
 #ifdef TCP_HHOOK
 	/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 	hhook_run_tcp_est_in(tp, th, to);
 #endif
 
 	TCPSTAT_ADD(tcps_rcvackpack, nsegs);
 	TCPSTAT_ADD(tcps_rcvackbyte, acked);
 	sbdrop(&so->so_snd, acked);
 	/*
 	 * Let the congestion control algorithm update congestion control
 	 * related information. This typically means increasing the
 	 * congestion window.
 	 */
 	rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
 
 	tp->snd_una = th->th_ack;
 	/*
 	 * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
 	 */
 	tp->snd_wl2 = th->th_ack;
 	tp->t_dupacks = 0;
 	m_freem(m);
 	/* ND6_HINT(tp);	 *//* Some progress has been made. */
 
 	/*
 	 * If all outstanding data are acked, stop retransmit timer,
 	 * otherwise restart timer using current (possibly backed-off)
 	 * value. If process is waiting for space, wakeup/selwakeup/signal.
 	 * If data are ready to send, let tcp_output decide between more
 	 * output or persist.
 	 */
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp,
 		    (void *)tcp_saveipgen,
 		    &tcp_savetcp, 0);
 #endif
 	if (tp->snd_una == tp->snd_max) {
 		rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
 		tp->t_acktime = 0;
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 	}
 	/* Wake up the socket if we have room to write more */
 	sowwakeup(so);
 	if (sbavail(&so->so_snd)) {
 		rack->r_wanted_output++;
 	}
 	return (1);
 }
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 	int32_t todrop;
 	int32_t ourfinisacked = 0;
 
 	rack_calc_rwin(so, tp);
 	/*
 	 * If the state is SYN_SENT: if seg contains an ACK, but not for our
 	 * SYN, drop the input. if seg contains a RST, then drop the
 	 * connection. if seg does not contain SYN, then drop it. Otherwise
 	 * this is an acceptable SYN segment initialize tp->rcv_nxt and
 	 * tp->irs if seg contains ack then advance tp->snd_una if seg
 	 * contains an ECE and ECN support is enabled, the stream is ECN
 	 * capable. if SYN has been acked change to ESTABLISHED else
 	 * SYN_RCVD state arrange for segment to be acked (eventually)
 	 * continue processing rest of data/controls, beginning with URG
 	 */
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->iss) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
 		rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
 		TCP_PROBE5(connect__refused, NULL, tp,
 		    mtod(m, const char *), tp, th);
 		tp = tcp_drop(tp, ECONNREFUSED);
 		rack_do_drop(m, tp, ti_locked);
 		return (1);
 	}
 	if (thflags & TH_RST) {
 		rack_do_drop(m, tp, ti_locked);
 		return (1);
 	}
 	if (!(thflags & TH_SYN)) {
 		rack_do_drop(m, tp, ti_locked);
 		return (1);
 	}
 	tp->irs = th->th_seq;
 	tcp_rcvseqinit(tp);
 	if (thflags & TH_ACK) {
 		int tfo_partial = 0;
 		
 		TCPSTAT_INC(tcps_connects);
 		soisconnected(so);
 #ifdef MAC
 		mac_socketpeer_set_from_mbuf(m, so);
 #endif
 		/* Do window scaling on this connection? */
 		if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 		    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 		}
 		tp->rcv_adv += min(tp->rcv_wnd,
 		    TCP_MAXWIN << tp->rcv_scale);
 		/*
 		 * If not all the data that was sent in the TFO SYN
 		 * has been acked, resend the remainder right away.
 		 */
 		if (IS_FASTOPEN(tp->t_flags) &&
 		    (tp->snd_una != tp->snd_max)) {
 			tp->snd_nxt = th->th_ack;
 			tfo_partial = 1;
 		}
 		/*
 		 * If there's data, delay ACK; if there's also a FIN ACKNOW
 		 * will be turned on later.
 		 */
 		if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
 			rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr,
 					  ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__);
 			tp->t_flags |= TF_DELACK;
 		} else {
 			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
 			tp->t_flags |= TF_ACKNOW;
 		}
 
 		if ((thflags & TH_ECE) && V_tcp_do_ecn) {
 			tp->t_flags |= TF_ECN_PERMIT;
 			TCPSTAT_INC(tcps_ecn_shs);
 		}
 		if (SEQ_GT(th->th_ack, tp->snd_una)) {
 			/* 
 			 * We advance snd_una for the 
 			 * fast open case. If th_ack is
 			 * acknowledging data beyond 
 			 * snd_una we can't just call
 			 * ack-processing since the 
 			 * data stream in our send-map
 			 * will start at snd_una + 1 (one
 			 * beyond the SYN). If its just
 			 * equal we don't need to do that
 			 * and there is no send_map.
 			 */
 			tp->snd_una++;
 		}
 		/*
 		 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
 		 * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
 		 */
 		tp->t_starttime = ticks;
 		if (tp->t_flags & TF_NEEDFIN) {
 			tcp_state_change(tp, TCPS_FIN_WAIT_1);
 			tp->t_flags &= ~TF_NEEDFIN;
 			thflags &= ~TH_SYN;
 		} else {
 			tcp_state_change(tp, TCPS_ESTABLISHED);
 			TCP_PROBE5(connect__established, NULL, tp,
 			    mtod(m, const char *), tp, th);
 			cc_conn_init(tp);
 		}
 	} else {
 		/*
 		 * Received initial SYN in SYN-SENT[*] state => simultaneous
 		 * open.  If segment contains CC option and there is a
 		 * cached CC, apply TAO test. If it succeeds, connection is *
 		 * half-synchronized. Otherwise, do 3-way handshake:
 		 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
 		 * there was no CC option, clear cached CC value.
 		 */
 		tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
 		tcp_state_change(tp, TCPS_SYN_RECEIVED);
 	}
 	KASSERT(*ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
 	    "ti_locked %d", __func__, *ti_locked));
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	/*
 	 * Advance th->th_seq to correspond to first data byte. If data,
 	 * trim to stay within window, dropping FIN if necessary.
 	 */
 	th->th_seq++;
 	if (tlen > tp->rcv_wnd) {
 		todrop = tlen - tp->rcv_wnd;
 		m_adj(m, -todrop);
 		tlen = tp->rcv_wnd;
 		thflags &= ~TH_FIN;
 		TCPSTAT_INC(tcps_rcvpackafterwin);
 		TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 	}
 	tp->snd_wl1 = th->th_seq - 1;
 	tp->rcv_up = th->th_seq;
 	/*
 	 * Client side of transaction: already sent SYN and data. If the
 	 * remote host used T/TCP to validate the SYN, our data will be
 	 * ACK'd; if so, enter normal data segment processing in the middle
 	 * of step 5, ack processing. Otherwise, goto step 6.
 	 */
 	if (thflags & TH_ACK) {
 		if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
 			return (ret_val);
 		/* We may have changed to FIN_WAIT_1 above */
 		if (tp->t_state == TCPS_FIN_WAIT_1) {
 			/*
 			 * In FIN_WAIT_1 STATE in addition to the processing
 			 * for the ESTABLISHED state if our FIN is now
 			 * acknowledged then enter FIN_WAIT_2.
 			 */
 			if (ourfinisacked) {
 				/*
 				 * If we can't receive any more data, then
 				 * closing user can proceed. Starting the
 				 * timer is contrary to the specification,
 				 * but if we don't get a FIN we'll hang
 				 * forever.
 				 *
 				 * XXXjl: we should release the tp also, and
 				 * use a compressed state.
 				 */
 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					soisdisconnected(so);
 					tcp_timer_activate(tp, TT_2MSL,
 					    (tcp_fast_finwait2_recycle ?
 					    tcp_finwait2_timeout :
 					    TP_MAXIDLE(tp)));
 				}
 				tcp_state_change(tp, TCPS_FIN_WAIT_2);
 			}
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    ti_locked, tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 	int32_t ourfinisacked = 0;
 
 	rack_calc_rwin(so, tp);
 
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
 		rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	if (IS_FASTOPEN(tp->t_flags)) {
 		/*
 		 * When a TFO connection is in SYN_RECEIVED, the
 		 * only valid packets are the initial SYN, a
 		 * retransmit/copy of the initial SYN (possibly with
 		 * a subset of the original data), a valid ACK, a
 		 * FIN, or a RST.
 		 */
 		if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
 			rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		} else if (thflags & TH_SYN) {
 			/* non-initial SYN is ignored */
 			struct tcp_rack *rack;
 
 			rack = (struct tcp_rack *)tp->t_fb_ptr;
 			if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
 			    (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
 			    (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
 				rack_do_drop(m, NULL, ti_locked);
 				return (0);
 			}
 		} else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
 			rack_do_drop(m, NULL, ti_locked);
 			return (0);
 		}
 	}
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp, ti_locked));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	/*
 	 * In the SYN-RECEIVED state, validate that the packet belongs to
 	 * this connection before trimming the data to fit the receive
 	 * window.  Check the sequence number versus IRS since we know the
 	 * sequence numbers haven't wrapped.  This is a partial fix for the
 	 * "LAND" DoS attack.
 	 */
 	if (SEQ_LT(th->th_seq, tp->irs)) {
 		rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (IS_FASTOPEN(tp->t_flags)) {
 			tp->snd_wnd = tiwin;
 			cc_conn_init(tp);
 		}
 		return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 		    ti_locked, tiwin, thflags, nxt_pkt));
 	}
 	TCPSTAT_INC(tcps_connects);
 	soisconnected(so);
 	/* Do window scaling? */
 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 		tp->rcv_scale = tp->request_r_scale;
 		tp->snd_wnd = tiwin;
 	}
 	/*
 	 * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
 	 * FIN-WAIT-1
 	 */
 	tp->t_starttime = ticks;
 	if (tp->t_flags & TF_NEEDFIN) {
 		tcp_state_change(tp, TCPS_FIN_WAIT_1);
 		tp->t_flags &= ~TF_NEEDFIN;
 	} else {
 		tcp_state_change(tp, TCPS_ESTABLISHED);
 		TCP_PROBE5(accept__established, NULL, tp,
 		    mtod(m, const char *), tp, th);
 		if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
 			tcp_fastopen_decrement_counter(tp->t_tfo_pending);
 			tp->t_tfo_pending = NULL;
 
 			/*
 			 * Account for the ACK of our SYN prior to regular
 			 * ACK processing below.
 			 */
 			tp->snd_una++;
 		}
 		/*
 		 * TFO connections call cc_conn_init() during SYN
 		 * processing.  Calling it again here for such connections
 		 * is not harmless as it would undo the snd_cwnd reduction
 		 * that occurs when a TFO SYN|ACK is retransmitted.
 		 */
 		if (!IS_FASTOPEN(tp->t_flags))
 			cc_conn_init(tp);
 	}
 	/*
 	 * If segment contains data or ACK, will call tcp_reass() later; if
 	 * not, do so now to pass queued data to user.
 	 */
 	if (tlen == 0 && (thflags & TH_FIN) == 0)
 		(void)tcp_reass(tp, (struct tcphdr *)0, 0,
 		    (struct mbuf *)0);
 	tp->snd_wl1 = th->th_seq - 1;
 	if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (tp->t_state == TCPS_FIN_WAIT_1) {
 		/* We could have went to FIN_WAIT_1 (or EST) above */
 		/*
 		 * In FIN_WAIT_1 STATE in addition to the processing for the
 		 * ESTABLISHED state if our FIN is now acknowledged then
 		 * enter FIN_WAIT_2.
 		 */
 		if (ourfinisacked) {
 			/*
 			 * If we can't receive any more data, then closing
 			 * user can proceed. Starting the timer is contrary
 			 * to the specification, but if we don't get a FIN
 			 * we'll hang forever.
 			 *
 			 * XXXjl: we should release the tp also, and use a
 			 * compressed state.
 			 */
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				soisdisconnected(so);
 				tcp_timer_activate(tp, TT_2MSL,
 				    (tcp_fast_finwait2_recycle ?
 				    tcp_finwait2_timeout :
 				    TP_MAXIDLE(tp)));
 			}
 			tcp_state_change(tp, TCPS_FIN_WAIT_2);
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    ti_locked, tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 
 	/*
 	 * Header prediction: check for the two common cases of a
 	 * uni-directional data xfer.  If the packet has no control flags,
 	 * is in-sequence, the window didn't change and we're not
 	 * retransmitting, it's a candidate.  If the length is zero and the
 	 * ack moved forward, we're the sender side of the xfer.  Just free
 	 * the data acked & wake any higher level process that was blocked
 	 * waiting for space.  If the length is non-zero and the ack didn't
 	 * move, we're the receiver side.  If we're getting packets in-order
 	 * (the reassembly queue is empty), add the data toc The socket
 	 * buffer and note that we need a delayed ack. Make sure that the
 	 * hidden state-flags are also off. Since we check for
 	 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN.
 	 */
 	if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
 	    __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
 	    __predict_true(LIST_EMPTY(&tp->t_segq)) &&
 	    __predict_true(th->th_seq == tp->rcv_nxt)) {
 		struct tcp_rack *rack;
 
 		rack = (struct tcp_rack *)tp->t_fb_ptr;
 		if (tlen == 0) {
 			if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
 			    ti_locked, tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
 				return (0);
 			}
 		} else {
 			if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
 			    ti_locked, tiwin, nxt_pkt)) {
 				return (0);
 			}
 		}
 	}
 	rack_calc_rwin(so, tp);
 
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp, ti_locked));
 
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 
 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    ti_locked, tiwin, thflags, nxt_pkt));
 
 		} else if (tp->t_flags & TF_ACKNOW) {
 			rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
 			return (ret_val);
 		} else {
 			rack_do_drop(m, NULL, ti_locked);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (sbavail(&so->so_snd)) {
 		if (rack_progress_timeout_check(tp)) {
 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 			rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	/* State changes only happen in rack_process_data() */
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    ti_locked, tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 
 	rack_calc_rwin(so, tp);
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp, ti_locked));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    ti_locked, tiwin, thflags, nxt_pkt));
 
 		} else if (tp->t_flags & TF_ACKNOW) {
 			rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
 			return (ret_val);
 		} else {
 			rack_do_drop(m, NULL, ti_locked);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (sbavail(&so->so_snd)) {
 		if (rack_progress_timeout_check(tp)) {
 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 			rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    ti_locked, tiwin, thflags, nxt_pkt));
 }
 
 static int
 rack_check_data_after_close(struct mbuf *m, 
     struct tcpcb *tp, int32_t *ti_locked, int32_t *tlen, struct tcphdr *th, struct socket *so)
 {
 	struct tcp_rack *rack; 
 
 	KASSERT(*ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
 					   "CLOSE_WAIT && tlen ti_locked %d", __func__, *ti_locked));
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack->rc_allow_data_af_clo == 0) {
 	close_now:
 		tp = tcp_close(tp);
 		TCPSTAT_INC(tcps_rcvafterclose);
 		rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, (*tlen));
 		return (1);
 	}
 	if (sbavail(&so->so_snd) == 0)
 		goto close_now;
 	/* Ok we allow data that is ignored and a followup reset */
 	tp->rcv_nxt = th->th_seq + *tlen;
 	tp->t_flags2 |= TF2_DROP_AF_DATA;
 	rack->r_wanted_output = 1;
 	*tlen = 0;
 	return (0);
 }
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 	int32_t ourfinisacked = 0;
 
 	rack_calc_rwin(so, tp);
 
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp, ti_locked));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If new data are received on a connection after the user processes
 	 * are gone, then RST the other end.
 	 */
 	if ((so->so_state & SS_NOFDREF) && tlen) {
 		if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so))
 			return (1);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    ti_locked, tiwin, thflags, nxt_pkt));
 		} else if (tp->t_flags & TF_ACKNOW) {
 			rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
 			return (ret_val);
 		} else {
 			rack_do_drop(m, NULL, ti_locked);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (ourfinisacked) {
 		/*
 		 * If we can't receive any more data, then closing user can
 		 * proceed. Starting the timer is contrary to the
 		 * specification, but if we don't get a FIN we'll hang
 		 * forever.
 		 *
 		 * XXXjl: we should release the tp also, and use a
 		 * compressed state.
 		 */
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 			soisdisconnected(so);
 			tcp_timer_activate(tp, TT_2MSL,
 			    (tcp_fast_finwait2_recycle ?
 			    tcp_finwait2_timeout :
 			    TP_MAXIDLE(tp)));
 		}
 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
 	}
 	if (sbavail(&so->so_snd)) {
 		if (rack_progress_timeout_check(tp)) {
 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 			rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    ti_locked, tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 	int32_t ourfinisacked = 0;
 
 	rack_calc_rwin(so, tp);
 
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp, ti_locked));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If new data are received on a connection after the user processes
 	 * are gone, then RST the other end.
 	 */
 	if ((so->so_state & SS_NOFDREF) && tlen) {
 		if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so))
 			return (1);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    ti_locked, tiwin, thflags, nxt_pkt));
 		} else if (tp->t_flags & TF_ACKNOW) {
 			rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
 			return (ret_val);
 		} else {
 			rack_do_drop(m, NULL, ti_locked);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (ourfinisacked) {
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		tcp_twstart(tp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		*ti_locked = TI_UNLOCKED;
 		m_freem(m);
 		return (1);
 	}
 	if (sbavail(&so->so_snd)) {
 		if (rack_progress_timeout_check(tp)) {
 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 			rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    ti_locked, tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 	int32_t ourfinisacked = 0;
 
 	rack_calc_rwin(so, tp);
 
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp, ti_locked));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If new data are received on a connection after the user processes
 	 * are gone, then RST the other end.
 	 */
 	if ((so->so_state & SS_NOFDREF) && tlen) {
 		if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so))
 			return (1);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    ti_locked, tiwin, thflags, nxt_pkt));
 		} else if (tp->t_flags & TF_ACKNOW) {
 			rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
 			return (ret_val);
 		} else {
 			rack_do_drop(m, NULL, ti_locked);
 			return (0);
 		}
 	}
 	/*
 	 * case TCPS_LAST_ACK: Ack processing.
 	 */
 	if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (ourfinisacked) {
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		tp = tcp_close(tp);
 		rack_do_drop(m, tp, ti_locked);
 		return (1);
 	}
 	if (sbavail(&so->so_snd)) {
 		if (rack_progress_timeout_check(tp)) {
 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 			rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    ti_locked, tiwin, thflags, nxt_pkt));
 }
 
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 	int32_t ourfinisacked = 0;
 
 	rack_calc_rwin(so, tp);
 
 	/* Reset receive buffer auto scaling when not in bulk receive mode. */
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp, ti_locked));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, ti_locked, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If new data are received on a connection after the user processes
 	 * are gone, then RST the other end.
 	 */
 	if ((so->so_state & SS_NOFDREF) &&
 	    tlen) {
 		if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so))
 			return (1);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    ti_locked, tiwin, thflags, nxt_pkt));
 		} else if (tp->t_flags & TF_ACKNOW) {
 			rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
 			return (ret_val);
 		} else {
 			rack_do_drop(m, NULL, ti_locked);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (sbavail(&so->so_snd)) {
 		if (rack_progress_timeout_check(tp)) {
 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 			rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    ti_locked, tiwin, thflags, nxt_pkt));
 }
 
 
 static void inline
 rack_clear_rate_sample(struct tcp_rack *rack)
 {
 	rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
 	rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
 	rack->r_ctl.rack_rs.rs_rtt_tot = 0;
 }
 
 static int
 rack_init(struct tcpcb *tp)
 {
 	struct tcp_rack *rack = NULL;
 
 	tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
 	if (tp->t_fb_ptr == NULL) {
 		/*
 		 * We need to allocate memory but cant. The INP and INP_INFO
 		 * locks and they are recusive (happens during setup. So a
 		 * scheme to drop the locks fails :(
 		 *
 		 */
 		return (ENOMEM);
 	}
 	memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	TAILQ_INIT(&rack->r_ctl.rc_map);
 	TAILQ_INIT(&rack->r_ctl.rc_free);
 	TAILQ_INIT(&rack->r_ctl.rc_tmap);
 	rack->rc_tp = tp;
 	if (tp->t_inpcb) {
 		rack->rc_inp = tp->t_inpcb;
 	}
 	/* Probably not needed but lets be sure */
 	rack_clear_rate_sample(rack);
 	rack->r_cpu = 0;
 	rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
 	rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
 	rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
 	rack->rc_pace_reduce = rack_slot_reduction;
 	if (V_tcp_delack_enabled)
 		tp->t_delayed_ack = 1;
 	else
 		tp->t_delayed_ack = 0;
 	rack->rc_pace_max_segs = rack_hptsi_segments;
 	rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg;
 	rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
 	rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
 	rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
 	rack->r_idle_reduce_largest  = rack_reduce_largest_on_idle;
 	rack->r_enforce_min_pace = rack_min_pace_time;
 	rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req;
 	rack->r_ctl.rc_prop_rate = rack_proportional_rate;
 	rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
 	rack->r_ctl.rc_early_recovery = rack_early_recovery;
 	rack->rc_always_pace = rack_pace_every_seg;
 	rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
 	rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
 	rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
 	rack->r_ctl.rc_min_to = rack_min_to;
 	rack->r_ctl.rc_prr_inc_var = rack_inc_var;
 	rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
 	if (tp->snd_una != tp->snd_max) {
 		/* Create a send map for the current outstanding data */
 		struct rack_sendmap *rsm;
 
 		rsm = rack_alloc(rack);
 		if (rsm == NULL) {
 			uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
 			tp->t_fb_ptr = NULL;
 			return (ENOMEM);
 		}
 		rsm->r_flags = RACK_OVERMAX;
 		rsm->r_tim_lastsent[0] = tcp_ts_getticks();
 		rsm->r_rtr_cnt = 1;
 		rsm->r_rtr_bytes = 0;
 		rsm->r_start = tp->snd_una;
 		rsm->r_end = tp->snd_max;
 		rsm->r_sndcnt = 0;
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 		rsm->r_in_tmap = 1;
 	}
 	return (0);
 }
 
 static int
 rack_handoff_ok(struct tcpcb *tp)
 {
 	if ((tp->t_state == TCPS_CLOSED) ||
 	    (tp->t_state == TCPS_LISTEN)) {
 		/* Sure no problem though it may not stick */
 		return (0);
 	}
 	if ((tp->t_state == TCPS_SYN_SENT) ||
 	    (tp->t_state == TCPS_SYN_RECEIVED)) {
 		/*
 		 * We really don't know you have to get to ESTAB or beyond
 		 * to tell.
 		 */
 		return (EAGAIN);
 	}
 	if (tp->t_flags & TF_SACK_PERMIT) {
 		return (0);
 	}
 	/*
 	 * If we reach here we don't do SACK on this connection so we can
 	 * never do rack.
 	 */
 	return (EINVAL);
 }
 
 static void
 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
 {
 	if (tp->t_fb_ptr) {
 		struct tcp_rack *rack;
 		struct rack_sendmap *rsm;
 
 		rack = (struct tcp_rack *)tp->t_fb_ptr;
 #ifdef TCP_BLACKBOX
 		tcp_log_flowend(tp);
 #endif
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
 		while (rsm) {
 			TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
 			uma_zfree(rack_zone, rsm);
 			rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
 		}
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
 		while (rsm) {
 			TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
 			uma_zfree(rack_zone, rsm);
 			rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
 		}
 		rack->rc_free_cnt = 0;
 		uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
 		tp->t_fb_ptr = NULL;
 	}
 }
 
 static void
 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	switch (tp->t_state) {
 	case TCPS_SYN_SENT:
 		rack->r_state = TCPS_SYN_SENT;
 		rack->r_substate = rack_do_syn_sent;
 		break;
 	case TCPS_SYN_RECEIVED:
 		rack->r_state = TCPS_SYN_RECEIVED;
 		rack->r_substate = rack_do_syn_recv;
 		break;
 	case TCPS_ESTABLISHED:
 		rack->r_state = TCPS_ESTABLISHED;
 		rack->r_substate = rack_do_established;
 		break;
 	case TCPS_CLOSE_WAIT:
 		rack->r_state = TCPS_CLOSE_WAIT;
 		rack->r_substate = rack_do_close_wait;
 		break;
 	case TCPS_FIN_WAIT_1:
 		rack->r_state = TCPS_FIN_WAIT_1;
 		rack->r_substate = rack_do_fin_wait_1;
 		break;
 	case TCPS_CLOSING:
 		rack->r_state = TCPS_CLOSING;
 		rack->r_substate = rack_do_closing;
 		break;
 	case TCPS_LAST_ACK:
 		rack->r_state = TCPS_LAST_ACK;
 		rack->r_substate = rack_do_lastack;
 		break;
 	case TCPS_FIN_WAIT_2:
 		rack->r_state = TCPS_FIN_WAIT_2;
 		rack->r_substate = rack_do_fin_wait_2;
 		break;
 	case TCPS_LISTEN:
 	case TCPS_CLOSED:
 	case TCPS_TIME_WAIT:
 	default:
 #ifdef INVARIANTS
 		panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state);
 #endif
 		break;
 	};
 }
 
 
 static void
 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
 {
 	/*
 	 * We received an ack, and then did not
 	 * call send or were bounced out due to the
 	 * hpts was running. Now a timer is up as well, is
 	 * it the right timer?
 	 */
 	struct rack_sendmap *rsm;
 	int tmr_up;
 	
 	tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
 	if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
 		return;
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
 	    (tmr_up == PACE_TMR_RXT)) {
 		/* Should be an RXT */
 		return;
 	}
 	if (rsm == NULL) {
 		/* Nothing outstanding? */
 		if (tp->t_flags & TF_DELACK) {
 			if (tmr_up == PACE_TMR_DELACK)
 				/* We are supposed to have delayed ack up and we do */
 				return;
 		} else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
 			/* 
 			 * if we hit enobufs then we would expect the possiblity
 			 * of nothing outstanding and the RXT up (and the hptsi timer).
 			 */
 			return;
 		} else if (((tcp_always_keepalive ||
 			     rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
 			    (tp->t_state <= TCPS_CLOSING)) &&
 			   (tmr_up == PACE_TMR_KEEP) &&
 			   (tp->snd_max == tp->snd_una)) {
 			/* We should have keep alive up and we do */
 			return;
 		}
 	}
 	if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) {
 		if ((tp->t_flags & TF_SENTFIN) &&
 		    ((tp->snd_max - tp->snd_una) == 1) &&
 		    (rsm->r_flags & RACK_HAS_FIN)) {
 			/* needs to be a RXT */
 			if (tmr_up == PACE_TMR_RXT)
 				return;
 		} else if (tmr_up == PACE_TMR_RACK)
 			return;
 	} else if (SEQ_GT(tp->snd_max,tp->snd_una) &&
 		   ((tmr_up == PACE_TMR_TLP) ||
 		    (tmr_up == PACE_TMR_RXT))) {
 		/* 
 		 * Either a TLP or RXT is fine if no sack-passed 
 		 * is in place and data is outstanding.
 		 */
 		return;
 	} else if (tmr_up == PACE_TMR_DELACK) {
 		/*
 		 * If the delayed ack was going to go off
 		 * before the rtx/tlp/rack timer were going to
 		 * expire, then that would be the timer in control.
 		 * Note we don't check the time here trusting the
 		 * code is correct.
 		 */
 		return;
 	}
 	/* 
 	 * Ok the timer originally started is not what we want now.
 	 * We will force the hpts to be stopped if any, and restart
 	 * with the slot set to what was in the saved slot.
 	 */
 	rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 	rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
 }
 
 static void
 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
     int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv)
 {
 	int32_t thflags, retval, did_out = 0;
 	int32_t way_out = 0;
 	uint32_t cts;
 	uint32_t tiwin;
 	struct tcpopt to;
 	struct tcp_rack *rack;
 	struct rack_sendmap *rsm;
 	int32_t prev_state = 0;
 
 	cts = tcp_tv_to_mssectick(tv);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 
 	kern_prefetch(rack, &prev_state);
 	prev_state = 0;
 	thflags = th->th_flags;
 	/*
 	 * If this is either a state-changing packet or current state isn't
 	 * established, we require a read lock on tcbinfo.  Otherwise, we
 	 * allow the tcbinfo to be in either locked or unlocked, as the
 	 * caller may have unnecessarily acquired a lock due to a race.
 	 */
 	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
 	    tp->t_state != TCPS_ESTABLISHED) {
 		KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
 		    "SYN/FIN/RST/!EST", __func__, ti_locked));
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	} else {
 #ifdef INVARIANTS
 		if (ti_locked == TI_RLOCKED) {
 			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		} else {
 			KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
 			    "ti_locked: %d", __func__, ti_locked));
 			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 		}
 #endif
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
 	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
 	    __func__));
 	{
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
 		    tlen, &log, true);
 	}
 	/*
 	 * Segment received on connection. Reset idle time and keep-alive
 	 * timer. XXX: This should be done after segment validation to
 	 * ignore broken/spoofed segs.
 	 */
 	if  (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) {
 #ifdef NETFLIX_CWV
 		if ((tp->cwv_enabled) &&
 		    ((tp->cwv_cwnd_valid == 0) &&
 		     TCPS_HAVEESTABLISHED(tp->t_state) &&
 		     (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) {
 			tcp_newcwv_nvp_closedown(tp);
 		} else 
 #endif
 		       if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
 			counter_u64_add(rack_input_idle_reduces, 1);
 			rack_cc_after_idle(tp,
 			    (rack->r_idle_reduce_largest ? 1 :0));
 		}
 	}
 	rack->r_ctl.rc_rcvtime = cts;
 	tp->t_rcvtime = ticks;
 
 #ifdef NETFLIX_CWV
 	if (tp->cwv_enabled) {
 		if ((tp->cwv_cwnd_valid == 0) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
 			tcp_newcwv_nvp_closedown(tp);
 	}
 #endif
 	/*
 	 * Unscale the window into a 32-bit value. For the SYN_SENT state
 	 * the scale is zero.
 	 */
 	tiwin = th->th_win << tp->snd_scale;
 #ifdef NETFLIX_STATS
 	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
 #endif
 	/*
 	 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
 	 * this to occur after we've validated the segment.
 	 */
 	if (tp->t_flags & TF_ECN_PERMIT) {
 		if (thflags & TH_CWR)
 			tp->t_flags &= ~TF_ECN_SND_ECE;
 		switch (iptos & IPTOS_ECN_MASK) {
 		case IPTOS_ECN_CE:
 			tp->t_flags |= TF_ECN_SND_ECE;
 			TCPSTAT_INC(tcps_ecn_ce);
 			break;
 		case IPTOS_ECN_ECT0:
 			TCPSTAT_INC(tcps_ecn_ect0);
 			break;
 		case IPTOS_ECN_ECT1:
 			TCPSTAT_INC(tcps_ecn_ect1);
 			break;
 		}
 		/* Congestion experienced. */
 		if (thflags & TH_ECE) {
 			rack_cong_signal(tp, th, CC_ECN);
 		}
 	}
 	/*
 	 * Parse options on any incoming segment.
 	 */
 	tcp_dooptions(&to, (u_char *)(th + 1),
 	    (th->th_off << 2) - sizeof(struct tcphdr),
 	    (thflags & TH_SYN) ? TO_SYN : 0);
 
 	/*
 	 * If echoed timestamp is later than the current time, fall back to
 	 * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
 	 * were used when this connection was established.
 	 */
 	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
 		to.to_tsecr -= tp->ts_offset;
 		if (TSTMP_GT(to.to_tsecr, cts))
 			to.to_tsecr = 0;
 	}
 	/*
 	 * If its the first time in we need to take care of options and
 	 * verify we can do SACK for rack!
 	 */
 	if (rack->r_state == 0) {
 		/* Should be init'd by rack_init() */
 		KASSERT(rack->rc_inp != NULL,
 		    ("%s: rack->rc_inp unexpectedly NULL", __func__));
 		if (rack->rc_inp == NULL) {
 			rack->rc_inp = tp->t_inpcb;
 		}
 
 		/*
 		 * Process options only when we get SYN/ACK back. The SYN
 		 * case for incoming connections is handled in tcp_syncache.
 		 * According to RFC1323 the window field in a SYN (i.e., a
 		 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
 		 * this is traditional behavior, may need to be cleaned up.
 		 */
 		rack->r_cpu = inp_to_cpuid(tp->t_inpcb);
 		if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
 			if ((to.to_flags & TOF_SCALE) &&
 			    (tp->t_flags & TF_REQ_SCALE)) {
 				tp->t_flags |= TF_RCVD_SCALE;
 				tp->snd_scale = to.to_wscale;
 			}
 			/*
 			 * Initial send window.  It will be updated with the
 			 * next incoming segment to the scaled value.
 			 */
 			tp->snd_wnd = th->th_win;
 			if (to.to_flags & TOF_TS) {
 				tp->t_flags |= TF_RCVD_TSTMP;
 				tp->ts_recent = to.to_tsval;
 				tp->ts_recent_age = cts;
 			}
 			if (to.to_flags & TOF_MSS)
 				tcp_mss(tp, to.to_mss);
 			if ((tp->t_flags & TF_SACK_PERMIT) &&
 			    (to.to_flags & TOF_SACKPERM) == 0)
 				tp->t_flags &= ~TF_SACK_PERMIT;
 		}
 		/*
 		 * At this point we are at the initial call. Here we decide
 		 * if we are doing RACK or not. We do this by seeing if
 		 * TF_SACK_PERMIT is set, if not rack is *not* possible and
 		 * we switch to the default code.
 		 */
 		if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
 			tcp_switch_back_to_default(tp);
 			(*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
 			    tlen, iptos, ti_locked);
 			return;
 		}
 		/* Set the flag */
 		rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 		tcp_set_hpts(tp->t_inpcb);
 		rack_stop_all_timers(tp);
 		sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
 	}
 	/*
 	 * This is the one exception case where we set the rack state
 	 * always. All other times (timers etc) we must have a rack-state
 	 * set (so we assure we have done the checks above for SACK).
 	 */
 	if (rack->r_state != tp->t_state)
 		rack_set_state(tp, rack);
 	if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL)
 		kern_prefetch(rsm, &prev_state);
 	prev_state = rack->r_state;
 	rack->r_ctl.rc_tlp_send_cnt = 0;
 	rack_clear_rate_sample(rack);
 	retval = (*rack->r_substate) (m, th, so,
 	    tp, &to, drop_hdrlen,
 	    tlen, &ti_locked, tiwin, thflags, nxt_pkt);
 #ifdef INVARIANTS
 	if ((retval == 0) &&
 	    (tp->t_inpcb == NULL)) {
 		panic("retval:%d tp:%p t_inpcb:NULL state:%d",
 		    retval, tp, prev_state);
 	}
 #endif
 	if (ti_locked != TI_UNLOCKED) {
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		ti_locked = TI_UNLOCKED;
 	}
 	if (retval == 0) {
 		/*
 		 * If retval is 1 the tcb is unlocked and most likely the tp
 		 * is gone.
 		 */
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 		tcp_rack_xmit_timer_commit(rack, tp);
 		if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) &&
 		    (rack->rc_in_persist == 0)){
 			/* 
 			 * The peer shrunk its window on us to the point
 			 * where we have sent too much. The only thing
 			 * we can do here is stop any timers and
 			 * enter persist. We most likely lost the last
 			 * bytes we sent but oh well, we will have to
 			 * retransmit them after the peer is caught up.
 			 */
 			if (rack->rc_inp->inp_in_hpts)
 				tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
 			rack_timer_cancel(tp, rack, cts, __LINE__);
 			rack_enter_persist(tp, rack, cts);
 			rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
 			way_out = 3;
 			goto done_with_input;
 		}
 		if (nxt_pkt == 0) {
 			if (rack->r_wanted_output != 0) {
 				did_out = 1;
 				(void)tp->t_fb->tfb_tcp_output(tp);
 			}
 			rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
 		}
 		if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
 		    (SEQ_GT(tp->snd_max, tp->snd_una) ||
 		     (tp->t_flags & TF_DELACK) ||
 		     ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
 		      (tp->t_state <= TCPS_CLOSING)))) {
 			/* We could not send (probably in the hpts but stopped the timer earlier)? */
 			if ((tp->snd_max == tp->snd_una) &&
 			    ((tp->t_flags & TF_DELACK) == 0) &&
 			    (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
 				/* keep alive not needed if we are hptsi output yet */
 				;
 			} else {
 				if (rack->rc_inp->inp_in_hpts)
 					tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
 				rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
 			}
 			way_out = 1;
 		} else {
 			/* Do we have the correct timer running? */
 			rack_timer_audit(tp, rack, &so->so_snd);
 			way_out = 2;
 		}
 	done_with_input:
 		rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
 		if (did_out)
 			rack->r_wanted_output = 0;
 #ifdef INVARIANTS
 		if (tp->t_inpcb == NULL) {
 			panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
 			      did_out,
 			      retval, tp, prev_state);
 		}
 #endif
 		INP_WUNLOCK(tp->t_inpcb);
 	}
 }
 
 void
 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
     int32_t ti_locked)
 {
 	struct timeval tv;
 #ifdef RSS
 	struct tcp_function_block *tfb;
 	struct tcp_rack *rack;
 	struct inpcb *inp;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack->r_state == 0) {
 		/*
 		 * Initial input (ACK to SYN-ACK etc)lets go ahead and get
 		 * it processed
 		 */
-		if (ti_locked != TI_RLOCKED && INP_INFO_TRY_RLOCK(&V_tcbinfo))
-			ti_locked = TI_RLOCKED;
-		if (ti_locked != TI_RLOCKED) {
-			inp = tp->t_inpcb;
-			tfb = tp->t_fb;
-			in_pcbref(inp);
-			INP_WUNLOCK(inp);
-			INP_INFO_RLOCK(&V_tcbinfo);
-			ti_locked = TI_RLOCKED;
-			INP_WLOCK(inp);
-			if (in_pcbrele_wlocked(inp))
-				inp = NULL;
-			if (inp == NULL || (inp->inp_flags2 & INP_FREED) ||
-			    (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) {
-				/* The TCPCB went away. Free the packet. */
-				INP_INFO_RUNLOCK(&V_tcbinfo);
-				if (inp)
-					INP_WUNLOCK(inp);
-				m_freem(m);
-				return;
-			}
-			/* If the stack changed, call the correct stack. */
-			if (tp->t_fb != tfb) {
-				tp->t_fb->tfb_tcp_do_segment(m, th, so, tp,
-				    drop_hdrlen, tlen, iptos, ti_locked);
-				return;
-			}
-		}
+		INP_INFO_RLOCK();
+		ti_locked = TI_RLOCKED;
 		tcp_get_usecs(&tv);
 		rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
 		    tlen, iptos, ti_locked, 0, &tv);
 		return;
 	}
 	if (ti_locked == TI_RLOCKED)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos, (uint8_t) ti_locked);
 	INP_WUNLOCK(tp->t_inpcb);
 #else
 	tcp_get_usecs(&tv);
 	rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
 	    tlen, iptos, ti_locked, 0, &tv);
 #endif
 }
 
 struct rack_sendmap *
 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
 {
 	struct rack_sendmap *rsm = NULL;
 	int32_t idx;
 	uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0;
 
 	/* Return the next guy to be re-transmitted */
 	if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
 		return (NULL);
 	}
 	if (tp->t_flags & TF_SENTFIN) {
 		/* retran the end FIN? */
 		return (NULL);
 	}
 	/* ok lets look at this one */
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
 		goto check_it;
 	}
 	rsm = rack_find_lowest_rsm(rack);
 	if (rsm == NULL) {
 		return (NULL);
 	}
 check_it:
 	srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
 	srtt = TICKS_2_MSEC(srtt_cur);
 	if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
 		srtt = rack->rc_rack_rtt;
 	if (rsm->r_flags & RACK_ACKED) {
 		return (NULL);
 	}
 	if ((rsm->r_flags & RACK_SACK_PASSED) == 0) {
 		/* Its not yet ready */
 		return (NULL);
 	}
 	idx = rsm->r_rtr_cnt - 1;
 	ts_low = rsm->r_tim_lastsent[idx];
 	thresh = rack_calc_thresh_rack(rack, srtt, tsused);
 	if (tsused <= ts_low) {
 		return (NULL);
 	}
 	if ((tsused - ts_low) >= thresh) {
 		return (rsm);
 	}
 	return (NULL);
 }
 
 static int
 rack_output(struct tcpcb *tp)
 {
 	struct socket *so;
 	uint32_t recwin, sendwin;
 	uint32_t sb_offset;
 	int32_t len, flags, error = 0;
 	struct mbuf *m;
 	struct mbuf *mb;
 	uint32_t if_hw_tsomaxsegcount = 0;
 	uint32_t if_hw_tsomaxsegsize;
 	long tot_len_this_send = 0;
 	struct ip *ip = NULL;
 #ifdef TCPDEBUG
 	struct ipovly *ipov = NULL;
 #endif
 	struct udphdr *udp = NULL;
 	struct tcp_rack *rack;
 	struct tcphdr *th;
 	uint8_t pass = 0;
 	uint8_t wanted_cookie = 0;
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen, ulen=0;
 	uint32_t rack_seq;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	unsigned ipsec_optlen = 0;
 
 #endif
 	int32_t idle, sendalot;
 	int32_t sub_from_prr = 0;
 	volatile int32_t sack_rxmit;
 	struct rack_sendmap *rsm = NULL;
 	int32_t tso, mtu, would_have_fin = 0;
 	struct tcpopt to;
 	int32_t slot = 0;
 	uint32_t cts;
 	uint8_t hpts_calling, doing_tlp = 0;
 	int32_t do_a_prefetch;
 	int32_t prefetch_rsm = 0;
 	int32_t prefetch_so_done = 0;
 	struct tcp_log_buffer *lgb = NULL;
 	struct inpcb *inp;
 	struct sockbuf *sb;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int32_t isipv6;
 #endif
 	/* setup and take the cache hits here */
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	inp = rack->rc_inp;
 	so = inp->inp_socket;
 	sb = &so->so_snd;
 	kern_prefetch(sb, &do_a_prefetch);
 	do_a_prefetch = 1;
 	
 	INP_WLOCK_ASSERT(inp);
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return (tcp_offload_output(tp));
 #endif
 
 	/*
 	 * For TFO connections in SYN_RECEIVED, only allow the initial
 	 * SYN|ACK and those sent by the retransmit timer.
 	 */
 	if (IS_FASTOPEN(tp->t_flags) &&
 	    (tp->t_state == TCPS_SYN_RECEIVED) &&
 	    SEQ_GT(tp->snd_max, tp->snd_una) &&    /* initial SYN|ACK sent */
 	    (rack->r_ctl.rc_resend == NULL))         /* not a retransmit */
 		return (0);
 #ifdef INET6
 	if (rack->r_state) {
 		/* Use the cache line loaded if possible */
 		isipv6 = rack->r_is_v6;
 	} else {
 		isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 	}
 #endif
 	cts = tcp_ts_getticks();
 	if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
 	    inp->inp_in_hpts) {
 		/*
 		 * We are on the hpts for some timer but not hptsi output.
 		 * Remove from the hpts unconditionally.
 		 */
 		rack_timer_cancel(tp, rack, cts, __LINE__);
 	}
 	/* Mark that we have called rack_output(). */
 	if ((rack->r_timer_override) ||
 	    (tp->t_flags & TF_FORCEDATA) ||
 	    (tp->t_state < TCPS_ESTABLISHED)) {
 		if (tp->t_inpcb->inp_in_hpts)
 			tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
 	} else if (tp->t_inpcb->inp_in_hpts) {
 		/*
 		 * On the hpts you can't pass even if ACKNOW is on, we will
 		 * when the hpts fires.
 		 */
 		counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
 		return (0);
 	}
 	hpts_calling = inp->inp_hpts_calls;
 	inp->inp_hpts_calls = 0;
 	if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
 		if (rack_process_timers(tp, rack, cts, hpts_calling)) {
 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
 			return (0);
 		}
 	}
 	rack->r_wanted_output = 0;
 	rack->r_timer_override = 0;
 	/*
 	 * Determine length of data that should be transmitted, and flags
 	 * that will be used. If there is some data or critical controls
 	 * (SYN, RST) to send, then transmit; otherwise, investigate
 	 * further.
 	 */
 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 #ifdef NETFLIX_CWV
 	if (tp->cwv_enabled) {
 		if ((tp->cwv_cwnd_valid == 0) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
 			tcp_newcwv_nvp_closedown(tp);
 	} else
 #endif
 	if (tp->t_idle_reduce) {
 		if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
 			rack_cc_after_idle(tp,
 		            (rack->r_idle_reduce_largest ? 1 :0));
 	}
 	tp->t_flags &= ~TF_LASTIDLE;
 	if (idle) {
 		if (tp->t_flags & TF_MORETOCOME) {
 			tp->t_flags |= TF_LASTIDLE;
 			idle = 0;
 		}
 	}
 again:
 	/*
 	 * If we've recently taken a timeout, snd_max will be greater than
 	 * snd_nxt.  There may be SACK information that allows us to avoid
 	 * resending already delivered data.  Adjust snd_nxt accordingly.
 	 */
 	sendalot = 0;
 	cts = tcp_ts_getticks();
 	tso = 0;
 	mtu = 0;
 	sb_offset = tp->snd_max - tp->snd_una;
 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 
 	flags = tcp_outflags[tp->t_state];
 	/*
 	 * Send any SACK-generated retransmissions.  If we're explicitly
 	 * trying to send out new data (when sendalot is 1), bypass this
 	 * function. If we retransmit in fast recovery mode, decrement
 	 * snd_cwnd, since we're replacing a (future) new transmission with
 	 * a retransmission now, and we previously incremented snd_cwnd in
 	 * tcp_input().
 	 */
 	/*
 	 * Still in sack recovery , reset rxmit flag to zero.
 	 */
 	while (rack->rc_free_cnt < rack_free_cache) {
 		rsm = rack_alloc(rack);
 		if (rsm == NULL) {
 			if (inp->inp_hpts_calls)
 				/* Retry in a ms */
 				slot = 1;
 			goto just_return_nolock;
 		}
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
 		rack->rc_free_cnt++;
 		rsm = NULL;
 	}
 	if (inp->inp_hpts_calls)
 		inp->inp_hpts_calls = 0;
 	sack_rxmit = 0;
 	len = 0;
 	rsm = NULL;
 	if (flags & TH_RST) {
 		SOCKBUF_LOCK(sb);
 		goto send;
 	}
 	if (rack->r_ctl.rc_tlpsend) {
 		/* Tail loss probe */
 		long cwin;
 		long tlen;
 
 		doing_tlp = 1;
 		rsm = rack->r_ctl.rc_tlpsend;
 		rack->r_ctl.rc_tlpsend = NULL;
 		sack_rxmit = 1;
 		tlen = rsm->r_end - rsm->r_start;
 		if (tlen > tp->t_maxseg)
 			tlen = tp->t_maxseg;
 #ifdef INVARIANTS
 		if (SEQ_GT(tp->snd_una, rsm->r_start)) {
 			panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u",
 			    tp, rack, tp->snd_una, rsm, rsm->r_start);
 		}
 #endif
 		sb_offset = rsm->r_start - tp->snd_una;
 		cwin = min(tp->snd_wnd, tlen);
 		len = cwin;
 	} else if (rack->r_ctl.rc_resend) {
 		/* Retransmit timer */
 		rsm = rack->r_ctl.rc_resend;
 		rack->r_ctl.rc_resend = NULL;
 		len = rsm->r_end - rsm->r_start;
 		sack_rxmit = 1;
 		sendalot = 0;
 		sb_offset = rsm->r_start - tp->snd_una;
 		if (len >= tp->t_maxseg) {
 			len = tp->t_maxseg;
 		}
 		KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
 		    __func__, sb_offset));
 	} else if ((rack->rc_in_persist == 0) &&
 	    ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
 		long tlen;
 
 		if ((!IN_RECOVERY(tp->t_flags)) &&
 		    ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
 			/* Enter recovery if not induced by a time-out */
 			rack->r_ctl.rc_rsm_start = rsm->r_start;
 			rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
 			rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
 			rack_cong_signal(tp, NULL, CC_NDUPACK);
 			/*
 			 * When we enter recovery we need to assure we send
 			 * one packet.
 			 */
 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 		}
 #ifdef INVARIANTS
 		if (SEQ_LT(rsm->r_start, tp->snd_una)) {
 			panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
 			    tp, rack, rsm, rsm->r_start, tp->snd_una);
 		}
 #endif
 		tlen = rsm->r_end - rsm->r_start;
 		sb_offset = rsm->r_start - tp->snd_una;
 		if (tlen > rack->r_ctl.rc_prr_sndcnt) {
 			len = rack->r_ctl.rc_prr_sndcnt;
 		} else {
 			len = tlen;
 		}
 		if (len >= tp->t_maxseg) {
 			sendalot = 1;
 			len = tp->t_maxseg;
 		} else {
 			sendalot = 0;
 			if ((rack->rc_timer_up == 0) &&
 			    (len < tlen)) {
 				/*
 				 * If its not a timer don't send a partial
 				 * segment.
 				 */
 				len = 0;
 				goto just_return_nolock;
 			}
 		}
 		KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
 		    __func__, sb_offset));
 		if (len > 0) {
 			sub_from_prr = 1;
 			sack_rxmit = 1;
 			TCPSTAT_INC(tcps_sack_rexmits);
 			TCPSTAT_ADD(tcps_sack_rexmit_bytes,
 			    min(len, tp->t_maxseg));
 			counter_u64_add(rack_rtm_prr_retran, 1);
 		}
 	}
 	if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
 		/* we are retransmitting the fin */
 		len--;
 		if (len) {
 			/*
 			 * When retransmitting data do *not* include the
 			 * FIN. This could happen from a TLP probe.
 			 */
 			flags &= ~TH_FIN;
 		}
 	}
 #ifdef INVARIANTS
 	/* For debugging */
 	rack->r_ctl.rc_rsm_at_retran = rsm;
 #endif
 	/*
 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
 	 * state flags.
 	 */
 	if (tp->t_flags & TF_NEEDFIN)
 		flags |= TH_FIN;
 	if (tp->t_flags & TF_NEEDSYN)
 		flags |= TH_SYN;
 	if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
 		void *end_rsm;
 		end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
 		if (end_rsm)
 			kern_prefetch(end_rsm, &prefetch_rsm);
 		prefetch_rsm = 1;
 	}
 	SOCKBUF_LOCK(sb);
 	/*
 	 * If in persist timeout with window of 0, send 1 byte. Otherwise,
 	 * if window is small but nonzero and time TF_SENTFIN expired, we
 	 * will send what we can and go to transmit state.
 	 */
 	if (tp->t_flags & TF_FORCEDATA) {
 		if (sendwin == 0) {
 			/*
 			 * If we still have some data to send, then clear
 			 * the FIN bit.  Usually this would happen below
 			 * when it realizes that we aren't sending all the
 			 * data.  However, if we have exactly 1 byte of
 			 * unsent data, then it won't clear the FIN bit
 			 * below, and if we are in persist state, we wind up
 			 * sending the packet without recording that we sent
 			 * the FIN bit.
 			 *
 			 * We can't just blindly clear the FIN bit, because
 			 * if we don't have any more data to send then the
 			 * probe will be the FIN itself.
 			 */
 			if (sb_offset < sbused(sb))
 				flags &= ~TH_FIN;
 			sendwin = 1;
 		} else {
 			if (rack->rc_in_persist)
 				rack_exit_persist(tp, rack);
 			/*
 			 * If we are dropping persist mode then we need to
 			 * correct snd_nxt/snd_max and off.
 			 */
 			tp->snd_nxt = tp->snd_max;
 			sb_offset = tp->snd_nxt - tp->snd_una;
 		}
 	}
 	/*
 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
 	 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
 	 * negative length.  This can also occur when TCP opens up its
 	 * congestion window while receiving additional duplicate acks after
 	 * fast-retransmit because TCP will reset snd_nxt to snd_max after
 	 * the fast-retransmit.
 	 *
 	 * In the normal retransmit-FIN-only case, however, snd_nxt will be
 	 * set to snd_una, the sb_offset will be 0, and the length may wind
 	 * up 0.
 	 *
 	 * If sack_rxmit is true we are retransmitting from the scoreboard
 	 * in which case len is already set.
 	 */
 	if (sack_rxmit == 0) {
 		uint32_t avail;
 
 		avail = sbavail(sb);
 		if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
 			sb_offset = tp->snd_nxt - tp->snd_una;
 		else
 			sb_offset = 0;
 		if (IN_RECOVERY(tp->t_flags) == 0) {
 			if (rack->r_ctl.rc_tlp_new_data) {
 				/* TLP is forcing out new data */
 				if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
 					rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
 				}
 				if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
 					len = tp->snd_wnd;
 				else
 					len = rack->r_ctl.rc_tlp_new_data;
 				rack->r_ctl.rc_tlp_new_data = 0;
 				doing_tlp = 1;
 			} else {
 				if (sendwin > avail) {
 					/* use the available */
 					if (avail > sb_offset) {
 						len = (int32_t)(avail - sb_offset);
 					} else {
 						len = 0;
 					}
 				} else {
 					if (sendwin > sb_offset) {
 						len = (int32_t)(sendwin - sb_offset);
 					} else {
 						len = 0;
 					}
 				}
 			}
 		} else {
 			uint32_t outstanding;
 
 			/*
 			 * We are inside of a SACK recovery episode and are
 			 * sending new data, having retransmitted all the
 			 * data possible so far in the scoreboard.
 			 */
 			outstanding = tp->snd_max - tp->snd_una;
 			if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd)
 				len = 0;
 			else if (avail > sb_offset)
 				len = avail - sb_offset;
 			else
 				len = 0;
 			if (len > 0) {
 				if (len > rack->r_ctl.rc_prr_sndcnt)
 					len = rack->r_ctl.rc_prr_sndcnt;
 
 				if (len > 0) {
 					sub_from_prr = 1;
 					counter_u64_add(rack_rtm_prr_newdata, 1);
 				}
 			}
 			if (len > tp->t_maxseg) {
 				/*
 				 * We should never send more than a MSS when
 				 * retransmitting or sending new data in prr
 				 * mode unless the override flag is on. Most
 				 * likely the PRR algorithm is not going to
 				 * let us send a lot as well :-)
 				 */
 				if (rack->r_ctl.rc_prr_sendalot == 0)
 					len = tp->t_maxseg;
 			} else if (len < tp->t_maxseg) {
 				/*
 				 * Do we send any? The idea here is if the
 				 * send empty's the socket buffer we want to
 				 * do it. However if not then lets just wait
 				 * for our prr_sndcnt to get bigger.
 				 */
 				long leftinsb;
 
 				leftinsb = sbavail(sb) - sb_offset;
 				if (leftinsb > len) {
 					/* This send does not empty the sb */
 					len = 0;
 				}
 			}
 		}
 	}
 	if (prefetch_so_done == 0) {
 		kern_prefetch(so, &prefetch_so_done);
 		prefetch_so_done = 1;
 	}
 	/*
 	 * Lop off SYN bit if it has already been sent.  However, if this is
 	 * SYN-SENT state and if segment contains data and if we don't know
 	 * that foreign host supports TAO, suppress sending segment.
 	 */
 	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
 	    ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
 		if (tp->t_state != TCPS_SYN_RECEIVED)
 			flags &= ~TH_SYN;
 		/*
 		 * When sending additional segments following a TFO SYN|ACK,
 		 * do not include the SYN bit.
 		 */
 		if (IS_FASTOPEN(tp->t_flags) &&
 		    (tp->t_state == TCPS_SYN_RECEIVED))
 			flags &= ~TH_SYN;
 		sb_offset--, len++;
 	}
 	/*
 	 * Be careful not to send data and/or FIN on SYN segments. This
 	 * measure is needed to prevent interoperability problems with not
 	 * fully conformant TCP implementations.
 	 */
 	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 		len = 0;
 		flags &= ~TH_FIN;
 	}
 	/*
 	 * On TFO sockets, ensure no data is sent in the following cases:
 	 *
 	 *  - When retransmitting SYN|ACK on a passively-created socket
 	 *
 	 *  - When retransmitting SYN on an actively created socket
 	 *
 	 *  - When sending a zero-length cookie (cookie request) on an
 	 *    actively created socket
 	 *
 	 *  - When the socket is in the CLOSED state (RST is being sent)
 	 */
 	if (IS_FASTOPEN(tp->t_flags) &&
 	    (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
 	     ((tp->t_state == TCPS_SYN_SENT) &&
 	      (tp->t_tfo_client_cookie_len == 0)) ||
 	     (flags & TH_RST)))
 		len = 0;
 	/* Without fast-open there should never be data sent on a SYN */
 	if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
 		len = 0;
 	if (len <= 0) {
 		/*
 		 * If FIN has been sent but not acked, but we haven't been
 		 * called to retransmit, len will be < 0.  Otherwise, window
 		 * shrank after we sent into it.  If window shrank to 0,
 		 * cancel pending retransmit, pull snd_nxt back to (closed)
 		 * window, and set the persist timer if it isn't already
 		 * going.  If the window didn't close completely, just wait
 		 * for an ACK.
 		 *
 		 * We also do a general check here to ensure that we will
 		 * set the persist timer when we have data to send, but a
 		 * 0-byte window. This makes sure the persist timer is set
 		 * even if the packet hits one of the "goto send" lines
 		 * below.
 		 */
 		len = 0;
 		if ((tp->snd_wnd == 0) &&
 		    (TCPS_HAVEESTABLISHED(tp->t_state)) &&
 		    (sb_offset < (int)sbavail(sb))) {
 			tp->snd_nxt = tp->snd_una;
 			rack_enter_persist(tp, rack, cts);
 		}
 	}
 	/* len will be >= 0 after this point. */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 	tcp_sndbuf_autoscale(tp, so, sendwin);
 	/*
 	 * Decide if we can use TCP Segmentation Offloading (if supported by
 	 * hardware).
 	 *
 	 * TSO may only be used if we are in a pure bulk sending state.  The
 	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP
 	 * options prevent using TSO.  With TSO the TCP header is the same
 	 * (except for the sequence number) for all generated packets.  This
 	 * makes it impossible to transmit any options which vary per
 	 * generated segment or packet.
 	 *
 	 * IPv4 handling has a clear separation of ip options and ip header
 	 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
 	 * the right thing below to provide length of just ip options and thus
 	 * checking for ipoptlen is enough to decide if ip options are present.
 	 */
 
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
 	else
 #endif
 		if (tp->t_inpcb->inp_options)
 			ipoptlen = tp->t_inpcb->inp_options->m_len -
 			    offsetof(struct ipoption, ipopt_list);
 		else
 			ipoptlen = 0;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * Pre-calculate here as we save another lookup into the darknesses
 	 * of IPsec that way and can actually decide if TSO is ok.
 	 */
 #ifdef INET6
 	if (isipv6 && IPSEC_ENABLED(ipv6))
 		ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
 #ifdef INET
 	else
 #endif
 #endif				/* INET6 */
 #ifdef INET
 	if (IPSEC_ENABLED(ipv4))
 		ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
 #endif				/* INET */
 #endif
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	ipoptlen += ipsec_optlen;
 #endif
 	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
 	    (tp->t_port == 0) &&
 	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
 	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
 	    ipoptlen == 0)
 		tso = 1;
 	{
 		uint32_t outstanding;
 
 		outstanding = tp->snd_max - tp->snd_una;
 		if (tp->t_flags & TF_SENTFIN) {
 			/*
 			 * If we sent a fin, snd_max is 1 higher than
 			 * snd_una
 			 */
 			outstanding--;
 		}
 		if (outstanding > 0) {
 			/*
 			 * This is sub-optimal. We only send a stand alone
 			 * FIN on its own segment.
 			 */
 			if (flags & TH_FIN) {
 				flags &= ~TH_FIN;
 				would_have_fin = 1;
 			}
 		} else if (sack_rxmit) {
 			if ((rsm->r_flags & RACK_HAS_FIN) == 0)
 				flags &= ~TH_FIN;
 		} else {
 			if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
 			    sbused(sb)))
 				flags &= ~TH_FIN;
 		}
 	}
 	recwin = sbspace(&so->so_rcv);
 
 	/*
 	 * Sender silly window avoidance.   We transmit under the following
 	 * conditions when len is non-zero:
 	 *
 	 * - We have a full segment (or more with TSO) - This is the last
 	 * buffer in a write()/send() and we are either idle or running
 	 * NODELAY - we've timed out (e.g. persist timer) - we have more
 	 * then 1/2 the maximum send window's worth of data (receiver may be
 	 * limited the window size) - we need to retransmit
 	 */
 	if (len) {
 		if (len >= tp->t_maxseg) {
 			pass = 1;
 			goto send;
 		}
 		/*
 		 * NOTE! on localhost connections an 'ack' from the remote
 		 * end may occur synchronously with the output and cause us
 		 * to flush a buffer queued with moretocome.  XXX
 		 *
 		 */
 		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
 		    (idle || (tp->t_flags & TF_NODELAY)) &&
 		    ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && 
 		    (tp->t_flags & TF_NOPUSH) == 0) {
 			pass = 2;
 			goto send;
 		}
 		if (tp->t_flags & TF_FORCEDATA) {	/* typ. timeout case */
 			pass = 3;
 			goto send;
 		}
 		if ((tp->snd_una == tp->snd_max) && len) {	/* Nothing outstanding */
 			goto send;
 		}
 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
 			pass = 4;
 			goto send;
 		}
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {	/* retransmit case */
 			pass = 5;
 			goto send;
 		}
 		if (sack_rxmit) {
 			pass = 6;
 			goto send;
 		}
 	}
 	/*
 	 * Sending of standalone window updates.
 	 *
 	 * Window updates are important when we close our window due to a
 	 * full socket buffer and are opening it again after the application
 	 * reads data from it.  Once the window has opened again and the
 	 * remote end starts to send again the ACK clock takes over and
 	 * provides the most current window information.
 	 *
 	 * We must avoid the silly window syndrome whereas every read from
 	 * the receive buffer, no matter how small, causes a window update
 	 * to be sent.  We also should avoid sending a flurry of window
 	 * updates when the socket buffer had queued a lot of data and the
 	 * application is doing small reads.
 	 *
 	 * Prevent a flurry of pointless window updates by only sending an
 	 * update when we can increase the advertized window by more than
 	 * 1/4th of the socket buffer capacity.  When the buffer is getting
 	 * full or is very small be more aggressive and send an update
 	 * whenever we can increase by two mss sized segments. In all other
 	 * situations the ACK's to new incoming data will carry further
 	 * window increases.
 	 *
 	 * Don't send an independent window update if a delayed ACK is
 	 * pending (it will get piggy-backed on it) or the remote side
 	 * already has done a half-close and won't send more data.  Skip
 	 * this if the connection is in T/TCP half-open state.
 	 */
 	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
 	    !(tp->t_flags & TF_DELACK) &&
 	    !TCPS_HAVERCVDFIN(tp->t_state)) {
 		/*
 		 * "adv" is the amount we could increase the window, taking
 		 * into account that we are limited by TCP_MAXWIN <<
 		 * tp->rcv_scale.
 		 */
 		int32_t adv;
 		int oldwin;
 
 		adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
 			oldwin = (tp->rcv_adv - tp->rcv_nxt);
 			adv -= oldwin;
 		} else
 			oldwin = 0;
 
 		/*
 		 * If the new window size ends up being the same as the old
 		 * size when it is scaled, then don't force a window update.
 		 */
 		if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
 			goto dontupdate;
 
 		if (adv >= (int32_t)(2 * tp->t_maxseg) &&
 		    (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
 		    recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
 		    so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) {
 			pass = 7;
 			goto send;
 		}
 		if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
 			goto send;
 	}
 dontupdate:
 
 	/*
 	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 	 * is also a catch-all for the retransmit timer timeout case.
 	 */
 	if (tp->t_flags & TF_ACKNOW) {
 		pass = 8;
 		goto send;
 	}
 	if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
 		pass = 9;
 		goto send;
 	}
 	if (SEQ_GT(tp->snd_up, tp->snd_una)) {
 		pass = 10;
 		goto send;
 	}
 	/*
 	 * If our state indicates that FIN should be sent and we have not
 	 * yet done so, then we need to send.
 	 */
 	if (flags & TH_FIN) {
 		if ((tp->t_flags & TF_SENTFIN) ||
 		    (((tp->t_flags & TF_SENTFIN) == 0) &&
 		     (tp->snd_nxt == tp->snd_una))) {
 			pass = 11;
 			goto send;
 		}
 	}
 	/*
 	 * No reason to send a segment, just return.
 	 */
 just_return:
 	SOCKBUF_UNLOCK(sb);
 just_return_nolock:
 	if (tot_len_this_send == 0)
 		counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
 	rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
 	rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling);
 	tp->t_flags &= ~TF_FORCEDATA;
 	return (0);
 
 send:
 	if (doing_tlp == 0) {
 		/*
 		 * Data not a TLP, and its not the rxt firing. If it is the
 		 * rxt firing, we want to leave the tlp_in_progress flag on
 		 * so we don't send another TLP. It has to be a rack timer
 		 * or normal send (response to acked data) to clear the tlp
 		 * in progress flag.
 		 */
 		rack->rc_tlp_in_progress = 0;
 	}
 	SOCKBUF_LOCK_ASSERT(sb);
 	if (len > 0) {
 		if (len >= tp->t_maxseg)
 			tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
 	}
 	/*
 	 * Before ESTABLISHED, force sending of initial options unless TCP
 	 * set not to do any options. NOTE: we assume that the IP/TCP header
 	 * plus TCP options always fit in a single mbuf, leaving room for a
 	 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
 	 * + optlen <= MCLBYTES
 	 */
 	optlen = 0;
 #ifdef INET6
 	if (isipv6)
 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 #endif
 		hdrlen = sizeof(struct tcpiphdr);
 
 	/*
 	 * Compute options for segment. We only have to care about SYN and
 	 * established connection segments.  Options for SYN-ACK segments
 	 * are handled in TCP syncache.
 	 */
 	to.to_flags = 0;
 	if ((tp->t_flags & TF_NOOPT) == 0) {
 		/* Maximum segment size. */
 		if (flags & TH_SYN) {
 			tp->snd_nxt = tp->iss;
 			to.to_mss = tcp_mssopt(&inp->inp_inc);
 #ifdef NETFLIX_TCPOUDP
 			if (tp->t_port)
 				to.to_mss -= V_tcp_udp_tunneling_overhead;
 #endif
 			to.to_flags |= TOF_MSS;
 
 			/*
 			 * On SYN or SYN|ACK transmits on TFO connections,
 			 * only include the TFO option if it is not a
 			 * retransmit, as the presence of the TFO option may
 			 * have caused the original SYN or SYN|ACK to have
 			 * been dropped by a middlebox.
 			 */
 			if (IS_FASTOPEN(tp->t_flags) &&
 			    (tp->t_rxtshift == 0)) {
 				if (tp->t_state == TCPS_SYN_RECEIVED) {
 					to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
 					to.to_tfo_cookie =
 					    (u_int8_t *)&tp->t_tfo_cookie.server;
 					to.to_flags |= TOF_FASTOPEN;
 					wanted_cookie = 1;
 				} else if (tp->t_state == TCPS_SYN_SENT) {
 					to.to_tfo_len =
 					    tp->t_tfo_client_cookie_len;
 					to.to_tfo_cookie =
 					    tp->t_tfo_cookie.client;
 					to.to_flags |= TOF_FASTOPEN;
 					wanted_cookie = 1;
 					/*
 					 * If we wind up having more data to
 					 * send with the SYN than can fit in
 					 * one segment, don't send any more
 					 * until the SYN|ACK comes back from
 					 * the other end.
 					 */
 					sendalot = 0;
 				}
 			}
 		}
 		/* Window scaling. */
 		if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
 			to.to_wscale = tp->request_r_scale;
 			to.to_flags |= TOF_SCALE;
 		}
 		/* Timestamps. */
 		if ((tp->t_flags & TF_RCVD_TSTMP) ||
 		    ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
 			to.to_tsval = cts + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 		}
 		/* Set receive buffer autosizing timestamp. */
 		if (tp->rfbuf_ts == 0 &&
 		    (so->so_rcv.sb_flags & SB_AUTOSIZE))
 			tp->rfbuf_ts = tcp_ts_getticks();
 		/* Selective ACK's. */
 		if (flags & TH_SYN)
 			to.to_flags |= TOF_SACKPERM;
 		else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    tp->rcv_numsacks > 0) {
 			to.to_flags |= TOF_SACK;
 			to.to_nsacks = tp->rcv_numsacks;
 			to.to_sacks = (u_char *)tp->sackblks;
 		}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		/* TCP-MD5 (RFC2385). */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif				/* TCP_SIGNATURE */
 
 		/* Processing the options. */
 		hdrlen += optlen = tcp_addoptions(&to, opt);
 		/*
 		 * If we wanted a TFO option to be added, but it was unable
 		 * to fit, ensure no data is sent.
 		 */
 		if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
 		    !(to.to_flags & TOF_FASTOPEN))
 			len = 0;
 	}
 #ifdef NETFLIX_TCPOUDP
 	if (tp->t_port) {
 		if (V_tcp_udp_tunneling_port == 0) {
 			/* The port was removed?? */
 			SOCKBUF_UNLOCK(&so->so_snd);
 			return (EHOSTUNREACH);
 		}
 		hdrlen += sizeof(struct udphdr);
 	}
 #endif
 	ipoptlen = 0;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	ipoptlen += ipsec_optlen;
 #endif
 
 	/*
 	 * Adjust data length if insertion of options will bump the packet
 	 * length beyond the t_maxseg length. Clear the FIN bit because we
 	 * cut off the tail of the segment.
 	 */
 	if (len + optlen + ipoptlen > tp->t_maxseg) {
 		if (flags & TH_FIN) {
 			would_have_fin = 1;
 			flags &= ~TH_FIN;
 		}
 		if (tso) {
 			uint32_t if_hw_tsomax;
 			uint32_t moff;
 			int32_t max_len;
 
 			/* extract TSO information */
 			if_hw_tsomax = tp->t_tsomax;
 			if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
 			if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
 			KASSERT(ipoptlen == 0,
 			    ("%s: TSO can't do IP options", __func__));
 
 			/*
 			 * Check if we should limit by maximum payload
 			 * length:
 			 */
 			if (if_hw_tsomax != 0) {
 				/* compute maximum TSO length */
 				max_len = (if_hw_tsomax - hdrlen -
 				    max_linkhdr);
 				if (max_len <= 0) {
 					len = 0;
 				} else if (len > max_len) {
 					sendalot = 1;
 					len = max_len;
 				}
 			}
 			/*
 			 * Prevent the last segment from being fractional
 			 * unless the send sockbuf can be emptied:
 			 */
 			max_len = (tp->t_maxseg - optlen);
 			if ((sb_offset + len) < sbavail(sb)) {
 				moff = len % (u_int)max_len;
 				if (moff != 0) {
 					len -= moff;
 					sendalot = 1;
 				}
 			}
 			/*
 			 * In case there are too many small fragments don't
 			 * use TSO:
 			 */
 			if (len <= max_len) {
 				len = max_len;
 				sendalot = 1;
 				tso = 0;
 			}
 			/*
 			 * Send the FIN in a separate segment after the bulk
 			 * sending is done. We don't trust the TSO
 			 * implementations to clear the FIN flag on all but
 			 * the last segment.
 			 */
 			if (tp->t_flags & TF_NEEDFIN)
 				sendalot = 1;
 
 		} else {
 			len = tp->t_maxseg - optlen - ipoptlen;
 			sendalot = 1;
 		}
 	} else
 		tso = 0;
 	KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
 	    ("%s: len > IP_MAXPACKET", __func__));
 #ifdef DIAGNOSTIC
 #ifdef INET6
 	if (max_linkhdr + hdrlen > MCLBYTES)
 #else
 	if (max_linkhdr + hdrlen > MHLEN)
 #endif
 		panic("tcphdr too big");
 #endif
 
 	/*
 	 * This KASSERT is here to catch edge cases at a well defined place.
 	 * Before, those had triggered (random) panic conditions further
 	 * down.
 	 */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 	if ((len == 0) &&
 	    (flags & TH_FIN) &&
 	    (sbused(sb))) {
 		/*
 		 * We have outstanding data, don't send a fin by itself!.
 		 */
 		goto just_return;
 	}
 	/*
 	 * Grab a header mbuf, attaching a copy of data to be transmitted,
 	 * and initialize the header from the template for sends on this
 	 * connection.
 	 */
 	if (len) {
 		uint32_t max_val;
 		uint32_t moff;
 
 		if (rack->rc_pace_max_segs)
 			max_val = rack->rc_pace_max_segs * tp->t_maxseg;
 		else
 			max_val = len;
 		/*
 		 * We allow a limit on sending with hptsi.
 		 */
 		if (len > max_val) {
 			len = max_val;
 		}
 #ifdef INET6
 		if (MHLEN < hdrlen + max_linkhdr)
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 #endif
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 
 		if (m == NULL) {
 			SOCKBUF_UNLOCK(sb);
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 
 		/*
 		 * Start the m_copy functions from the closest mbuf to the
 		 * sb_offset in the socket buffer chain.
 		 */
 		mb = sbsndptr_noadv(sb, sb_offset, &moff);
 		if (len <= MHLEN - hdrlen - max_linkhdr) {
 			m_copydata(mb, moff, (int)len,
 			    mtod(m, caddr_t)+hdrlen);
 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 				sbsndptr_adv(sb, mb, len);
 			m->m_len += len;
 		} else {
 			struct sockbuf *msb;
 
 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 				msb = NULL;
 			else
 				msb = sb;
 			m->m_next = tcp_m_copym(mb, moff, &len,
 			    if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb);
 			if (len <= (tp->t_maxseg - optlen)) {
 				/* 
 				 * Must have ran out of mbufs for the copy
 				 * shorten it to no longer need tso. Lets
 				 * not put on sendalot since we are low on
 				 * mbufs.
 				 */
 				tso = 0;
 			}
 			if (m->m_next == NULL) {
 				SOCKBUF_UNLOCK(sb);
 				(void)m_free(m);
 				error = ENOBUFS;
 				sack_rxmit = 0;
 				goto out;
 			}
 		}
 		if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
 			TCPSTAT_INC(tcps_sndprobe);
 #ifdef NETFLIX_STATS
 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 				stats_voi_update_abs_u32(tp->t_stats,
 				    VOI_TCP_RETXPB, len);
 			else
 				stats_voi_update_abs_u64(tp->t_stats,
 				    VOI_TCP_TXPB, len);
 #endif
 		} else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
 			if (rsm && (rsm->r_flags & RACK_TLP)) {
 				/*
 				 * TLP should not count in retran count, but
 				 * in its own bin
 				 */
 				counter_u64_add(rack_tlp_retran, 1);
 				counter_u64_add(rack_tlp_retran_bytes, len);
 			} else {
 				tp->t_sndrexmitpack++;
 				TCPSTAT_INC(tcps_sndrexmitpack);
 				TCPSTAT_ADD(tcps_sndrexmitbyte, len);
 			}
 #ifdef NETFLIX_STATS
 			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
 			    len);
 #endif
 		} else {
 			TCPSTAT_INC(tcps_sndpack);
 			TCPSTAT_ADD(tcps_sndbyte, len);
 #ifdef NETFLIX_STATS
 			stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
 			    len);
 #endif
 		}
 		/*
 		 * If we're sending everything we've got, set PUSH. (This
 		 * will keep happy those implementations which only give
 		 * data to the user when a buffer fills or a PUSH comes in.)
 		 */
 		if (sb_offset + len == sbused(sb) &&
 		    sbused(sb) &&
 		    !(flags & TH_SYN))
 			flags |= TH_PUSH;
 
 		/*
 		 * Are we doing hptsi, if so we must calculate the slot. We
 		 * only do hptsi in ESTABLISHED and with no RESET being
 		 * sent where we have data to send.
 		 */
 		if (((tp->t_state == TCPS_ESTABLISHED) ||
 		    (tp->t_state == TCPS_CLOSE_WAIT) ||
 		    ((tp->t_state == TCPS_FIN_WAIT_1) &&
 		    ((tp->t_flags & TF_SENTFIN) == 0) &&
 		    ((flags & TH_FIN) == 0))) &&
 		    ((flags & TH_RST) == 0) &&
 		    (rack->rc_always_pace)) {
 			/*
 			 * We use the most optimistic possible cwnd/srtt for
 			 * sending calculations. This will make our
 			 * calculation anticipate getting more through
 			 * quicker then possible. But thats ok we don't want
 			 * the peer to have a gap in data sending.
 			 */
 			uint32_t srtt, cwnd, tr_perms = 0;
 	
 			if (rack->r_ctl.rc_rack_min_rtt)
 				srtt = rack->r_ctl.rc_rack_min_rtt;
 			else
 				srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
 			if (rack->r_ctl.rc_rack_largest_cwnd)
 				cwnd = rack->r_ctl.rc_rack_largest_cwnd;
 			else
 				cwnd = tp->snd_cwnd;
 			tr_perms = cwnd / srtt;
 			if (tr_perms == 0) {
 				tr_perms = tp->t_maxseg;
 			}
 			tot_len_this_send += len;
 			/*
 			 * Calculate how long this will take to drain, if
 			 * the calculation comes out to zero, thats ok we
 			 * will use send_a_lot to possibly spin around for
 			 * more increasing tot_len_this_send to the point
 			 * that its going to require a pace, or we hit the
 			 * cwnd. Which in that case we are just waiting for
 			 * a ACK.
 			 */
 			slot = tot_len_this_send / tr_perms;
 			/* Now do we reduce the time so we don't run dry? */
 			if (slot && rack->rc_pace_reduce) {
 				int32_t reduce;
 
 				reduce = (slot / rack->rc_pace_reduce);
 				if (reduce < slot) {
 					slot -= reduce;
 				} else
 					slot = 0;
 			}
 			if (rack->r_enforce_min_pace &&
 			    (slot == 0) &&
 			    (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) {
 				/* We are enforcing a minimum pace time of 1ms */
 				slot = rack->r_enforce_min_pace;
 			}
 		}
 		SOCKBUF_UNLOCK(sb);
 	} else {
 		SOCKBUF_UNLOCK(sb);
 		if (tp->t_flags & TF_ACKNOW)
 			TCPSTAT_INC(tcps_sndacks);
 		else if (flags & (TH_SYN | TH_FIN | TH_RST))
 			TCPSTAT_INC(tcps_sndctrl);
 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
 			TCPSTAT_INC(tcps_sndurg);
 		else
 			TCPSTAT_INC(tcps_sndwinup);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 #ifdef INET6
 		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
 		    MHLEN >= hdrlen) {
 			M_ALIGN(m, hdrlen);
 		} else
 #endif
 			m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 	}
 	SOCKBUF_UNLOCK_ASSERT(sb);
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 #ifdef NETFLIX_TCPOUDP
 		if (tp->t_port) {
 			udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
 			udp->uh_dport = tp->t_port;
 			ulen = hdrlen + len - sizeof(struct ip6_hdr);
 			udp->uh_ulen = htons(ulen);
 			th = (struct tcphdr *)(udp + 1);
 		} else
 #endif
 			th = (struct tcphdr *)(ip6 + 1);
 		tcpip_fillheaders(inp, ip6, th);
 	} else
 #endif				/* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 #ifdef TCPDEBUG
 		ipov = (struct ipovly *)ip;
 #endif
 #ifdef NETFLIX_TCPOUDP
 		if (tp->t_port) {
 			udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
 			udp->uh_dport = tp->t_port;
 			ulen = hdrlen + len - sizeof(struct ip);
 			udp->uh_ulen = htons(ulen);
 			th = (struct tcphdr *)(udp + 1);
 		} else
 #endif
 			th = (struct tcphdr *)(ip + 1);
 		tcpip_fillheaders(inp, ip, th);
 	}
 	/*
 	 * Fill in fields, remembering maximum advertised window for use in
 	 * delaying messages about window sizes. If resending a FIN, be sure
 	 * not to use a new sequence number.
 	 */
 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 	    tp->snd_nxt == tp->snd_max)
 		tp->snd_nxt--;
 	/*
 	 * If we are starting a connection, send ECN setup SYN packet. If we
 	 * are on a retransmit, we may resend those bits a number of times
 	 * as per RFC 3168.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
 		if (tp->t_rxtshift >= 1) {
 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
 				flags |= TH_ECE | TH_CWR;
 		} else
 			flags |= TH_ECE | TH_CWR;
 	}
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (tp->t_flags & TF_ECN_PERMIT)) {
 		/*
 		 * If the peer has ECN, mark data packets with ECN capable
 		 * transmission (ECT). Ignore pure ack packets,
 		 * retransmissions and window probes.
 		 */
 		if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
 		    !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
 #ifdef INET6
 			if (isipv6)
 				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
 			else
 #endif
 				ip->ip_tos |= IPTOS_ECN_ECT0;
 			TCPSTAT_INC(tcps_ecn_ect0);
 		}
 		/*
 		 * Reply with proper ECN notifications.
 		 */
 		if (tp->t_flags & TF_ECN_SND_CWR) {
 			flags |= TH_CWR;
 			tp->t_flags &= ~TF_ECN_SND_CWR;
 		}
 		if (tp->t_flags & TF_ECN_SND_ECE)
 			flags |= TH_ECE;
 	}
 	/*
 	 * If we are doing retransmissions, then snd_nxt will not reflect
 	 * the first unsent octet.  For ACK only packets, we do not want the
 	 * sequence number of the retransmitted packet, we want the sequence
 	 * number of the next unsent octet.  So, if there is no data (and no
 	 * SYN or FIN), use snd_max instead of snd_nxt when filling in
 	 * ti_seq.  But if we are in persist state, snd_max might reflect
 	 * one byte beyond the right edge of the window, so use snd_nxt in
 	 * that case, since we know we aren't doing a retransmission.
 	 * (retransmit and persist are mutually exclusive...)
 	 */
 	if (sack_rxmit == 0) {
 		if (len || (flags & (TH_SYN | TH_FIN)) ||
 		    rack->rc_in_persist) {
 			th->th_seq = htonl(tp->snd_nxt);
 			rack_seq = tp->snd_nxt;
 		} else if (flags & TH_RST) {
 			/*
 			 * For a Reset send the last cum ack in sequence
 			 * (this like any other choice may still generate a
 			 * challenge ack, if a ack-update packet is in
 			 * flight).
 			 */
 			th->th_seq = htonl(tp->snd_una);
 			rack_seq = tp->snd_una;
 		} else {
 			th->th_seq = htonl(tp->snd_max);
 			rack_seq = tp->snd_max;
 		}
 	} else {
 		th->th_seq = htonl(rsm->r_start);
 		rack_seq = rsm->r_start;
 	}
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
 		th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 	}
 	th->th_flags = flags;
 	/*
 	 * Calculate receive window.  Don't shrink window, but avoid silly
 	 * window syndrome.
 	 */
 	if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
 	    recwin < (long)tp->t_maxseg)
 		recwin = 0;
 	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
 	    recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
 		recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
 	if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
 		recwin = (long)TCP_MAXWIN << tp->rcv_scale;
 
 	/*
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
 	 * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
 	 * handled in syncache.
 	 */
 	if (flags & TH_SYN)
 		th->th_win = htons((u_short)
 		    (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
 	else
 		th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
 	/*
 	 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
 	 * window.  This may cause the remote transmitter to stall.  This
 	 * flag tells soreceive() to disable delayed acknowledgements when
 	 * draining the buffer.  This can occur if the receiver is
 	 * attempting to read more data than can be buffered prior to
 	 * transmitting on the connection.
 	 */
 	if (th->th_win == 0) {
 		tp->t_sndzerowin++;
 		tp->t_flags |= TF_RXWIN0SENT;
 	} else
 		tp->t_flags &= ~TF_RXWIN0SENT;
 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
 		th->th_flags |= TH_URG;
 	} else
 		/*
 		 * If no urgent pointer to send, then we pull the urgent
 		 * pointer to the left edge of the send window so that it
 		 * doesn't drift into the send window on sequence number
 		 * wraparound.
 		 */
 		tp->snd_up = tp->snd_una;	/* drag it along */
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if (to.to_flags & TOF_SIGNATURE) {
 		/*
 		 * Calculate MD5 signature and put it into the place
 		 * determined before.
 		 * NOTE: since TCP options buffer doesn't point into
 		 * mbuf's data, calculate offset and use it.
 		 */
 		if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
 		    (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
 			/*
 			 * Do not send segment if the calculation of MD5
 			 * digest has failed.
 			 */
 			goto out;
 		}
 	}
 #endif
 
 	/*
 	 * Put TCP length in extended header, and then checksum extended
 	 * header and data.
 	 */
 	m->m_pkthdr.len = hdrlen + len;	/* in6_cksum() need this */
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * ip6_plen is not need to be filled now, and will be filled
 		 * in ip6_output.
 		 */
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
 			th->th_sum = htons(0);
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in6_cksum_pseudo(ip6,
 			    sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
 			    0);
 		}
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
 			   ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
 			th->th_sum = htons(0);
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
 			    ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
 			    IPPROTO_TCP + len + optlen));
 		}
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
 	}
 #endif
 
 	/*
 	 * Enable TSO and specify the size of the segments. The TCP pseudo
 	 * header checksum is always provided. XXX: Fixme: This is currently
 	 * not the case for IPv6.
 	 */
 	if (tso) {
 		KASSERT(len > tp->t_maxseg - optlen,
 		    ("%s: len <= tso_segsz", __func__));
 		m->m_pkthdr.csum_flags |= CSUM_TSO;
 		m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
 	}
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
 	    ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u",
 	    __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
 #else
 	KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
 	    ("%s: mbuf chain shorter than expected: %d + %u + %u != %u",
 	    __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
 #endif
 
 #ifdef TCP_HHOOK
 	/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
 	hhook_run_tcp_est_out(tp, th, &to, len, tso);
 #endif
 
 #ifdef TCPDEBUG
 	/*
 	 * Trace.
 	 */
 	if (so->so_options & SO_DEBUG) {
 		u_short save = 0;
 
 #ifdef INET6
 		if (!isipv6)
 #endif
 		{
 			save = ipov->ih_len;
 			ipov->ih_len = htons(m->m_pkthdr.len	/* - hdrlen +
 			      * (th->th_off << 2) */ );
 		}
 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
 #ifdef INET6
 		if (!isipv6)
 #endif
 			ipov->ih_len = save;
 	}
 #endif				/* TCPDEBUG */
 
 	/* We're getting ready to send; log now. */
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
 		if (rsm || sack_rxmit) {
 			log.u_bbr.flex8 = 1;
 		} else {
 			log.u_bbr.flex8 = 0;
 		}
 		lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
 		    len, &log, false, NULL, NULL, 0, NULL);
 	} else
 		lgb = NULL;
 
 	/*
 	 * Fill in IP length and desired time to live and send to IP level.
 	 * There should be a better way to handle ttl and tos; we could keep
 	 * them in the template, but need a way to checksum without them.
 	 */
 	/*
 	 * m->m_pkthdr.len should have been set before cksum calcuration,
 	 * because in6_cksum() need it.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * we separately set hoplimit for every segment, since the
 		 * user might want to change the value via setsockopt. Also,
 		 * desired default hop limit might be changed via Neighbor
 		 * Discovery.
 		 */
 		ip6->ip6_hlim = in6_selecthlim(inp, NULL);
 
 		/*
 		 * Set the packet size here for the benefit of DTrace
 		 * probes. ip6_output() will set it properly; it's supposed
 		 * to include the option header lengths as well.
 		 */
 		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
 
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip6, tp, th);
 		/* TODO: IPv6 IP6TOS_ECT bit on */
 		error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
 		    &inp->inp_route6,
 		    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
 		    NULL, NULL, inp);
 
 		if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL)
 			mtu = inp->inp_route6.ro_rt->rt_mtu;
 	}
 #endif				/* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		ip->ip_len = htons(m->m_pkthdr.len);
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6PROTO)
 			ip->ip_ttl = in6_selecthlim(inp, NULL);
 #endif				/* INET6 */
 		/*
 		 * If we do path MTU discovery, then we set DF on every
 		 * packet. This might not be the best thing to do according
 		 * to RFC3390 Section 2. However the tcp hostcache migitates
 		 * the problem so it affects only the first tcp connection
 		 * with a host.
 		 *
 		 * NB: Don't set DF on small MTU/MSS to have a safe
 		 * fallback.
 		 */
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 			if (tp->t_port == 0 || len < V_tcp_minmss) {
 				ip->ip_off |= htons(IP_DF);
 			}
 		} else {
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 		}
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip, tp, th);
 
 		error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route,
 		    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
 		    inp);
 		if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL)
 			mtu = inp->inp_route.ro_rt->rt_mtu;
 	}
 #endif				/* INET */
 
 out:
 	if (lgb) {
 		lgb->tlb_errno = error;
 		lgb = NULL;
 	}
 	/*
 	 * In transmit state, time the transmission and arrange for the
 	 * retransmit.  In persist state, just set snd_max.
 	 */
 	if (error == 0) {
 		if (len == 0)
 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
 		else if (len == 1) {
 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
 		} else if (len > 1) {
 			int idx;
 
 			idx = (len / tp->t_maxseg) + 3;
 			if (idx >= TCP_MSS_ACCT_ATIMER)
 				counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
 			else
 				counter_u64_add(rack_out_size[idx], 1);
 		}
 	}
 	if (sub_from_prr && (error == 0)) {
 		rack->r_ctl.rc_prr_sndcnt -= len;
 	}
 	sub_from_prr = 0;
 	rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
 	    pass, rsm);
 	if ((tp->t_flags & TF_FORCEDATA) == 0 ||
 	    (rack->rc_in_persist == 0)) {
 #ifdef NETFLIX_STATS
 		tcp_seq startseq = tp->snd_nxt;
 #endif
 
 		/*
 		 * Advance snd_nxt over sequence space of this segment.
 		 */
 		if (error)
 			/* We don't log or do anything with errors */
 			goto timer;
 
 		if (flags & (TH_SYN | TH_FIN)) {
 			if (flags & TH_SYN)
 				tp->snd_nxt++;
 			if (flags & TH_FIN) {
 				tp->snd_nxt++;
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
 		/* In the ENOBUFS case we do *not* update snd_max */
 		if (sack_rxmit)
 			goto timer;
 
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			if (tp->snd_una == tp->snd_max) {
 				/*
 				 * Update the time we just added data since
 				 * none was outstanding.
 				 */
 				rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
 				tp->t_acktime = ticks;
 			}
 			tp->snd_max = tp->snd_nxt;
 #ifdef NETFLIX_STATS
 			if (!(tp->t_flags & TF_GPUTINPROG) && len) {
 				tp->t_flags |= TF_GPUTINPROG;
 				tp->gput_seq = startseq;
 				tp->gput_ack = startseq +
 				    ulmin(sbavail(sb) - sb_offset, sendwin);
 				tp->gput_ts = tcp_ts_getticks();
 			}
 #endif
 		}
 		/*
 		 * Set retransmit timer if not currently set, and not doing
 		 * a pure ack or a keep-alive probe. Initial value for
 		 * retransmit timer is smoothed round-trip time + 2 *
 		 * round-trip time variance. Initialize shift counter which
 		 * is used for backoff of retransmit time.
 		 */
 timer:
 		if ((tp->snd_wnd == 0) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
 			/*
 			 * If the persists timer was set above (right before
 			 * the goto send), and still needs to be on. Lets
 			 * make sure all is canceled. If the persist timer
 			 * is not running, we want to get it up.
 			 */
 			if (rack->rc_in_persist == 0) {
 				rack_enter_persist(tp, rack, cts);
 			}
 		}
 	} else {
 		/*
 		 * Persist case, update snd_max but since we are in persist
 		 * mode (no window) we do not update snd_nxt.
 		 */
 		int32_t xlen = len;
 
 		if (error)
 			goto nomore;
 
 		if (flags & TH_SYN)
 			++xlen;
 		if (flags & TH_FIN) {
 			++xlen;
 			tp->t_flags |= TF_SENTFIN;
 		}
 		/* In the ENOBUFS case we do *not* update snd_max */
 		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
 			if (tp->snd_una == tp->snd_max) {
 				/*
 				 * Update the time we just added data since
 				 * none was outstanding.
 				 */
 				rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
 				tp->t_acktime = ticks;
 			}
 			tp->snd_max = tp->snd_nxt + len;
 		}
 	}
 nomore:
 	if (error) {
 		SOCKBUF_UNLOCK_ASSERT(sb);	/* Check gotos. */
 		/*
 		 * Failures do not advance the seq counter above. For the
 		 * case of ENOBUFS we will fall out and retry in 1ms with
 		 * the hpts. Everything else will just have to retransmit
 		 * with the timer.
 		 *
 		 * In any case, we do not want to loop around for another
 		 * send without a good reason.
 		 */
 		sendalot = 0;
 		switch (error) {
 		case EPERM:
 			tp->t_flags &= ~TF_FORCEDATA;
 			tp->t_softerror = error;
 			return (error);
 		case ENOBUFS:
 			if (slot == 0) {
 				/*
 				 * Pace us right away to retry in a some
 				 * time
 				 */
 				slot = 1 + rack->rc_enobuf;
 				if (rack->rc_enobuf < 255)
 					rack->rc_enobuf++;
 				if (slot > (rack->rc_rack_rtt / 2)) {
 					slot = rack->rc_rack_rtt / 2;
 				}
 				if (slot < 10)
 					slot = 10;
 			}
 			counter_u64_add(rack_saw_enobuf, 1);
 			error = 0;
 			goto enobufs;
 		case EMSGSIZE:
 			/*
 			 * For some reason the interface we used initially
 			 * to send segments changed to another or lowered
 			 * its MTU. If TSO was active we either got an
 			 * interface without TSO capabilits or TSO was
 			 * turned off. If we obtained mtu from ip_output()
 			 * then update it and try again.
 			 */
 			if (tso)
 				tp->t_flags &= ~TF_TSO;
 			if (mtu != 0) {
 				tcp_mss_update(tp, -1, mtu, NULL, NULL);
 				goto again;
 			}
 			slot = 10;
 			rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
 			tp->t_flags &= ~TF_FORCEDATA;
 			return (error);
 		case ENETUNREACH:
 			counter_u64_add(rack_saw_enetunreach, 1);
 		case EHOSTDOWN:
 		case EHOSTUNREACH:
 		case ENETDOWN:
 			if (TCPS_HAVERCVDSYN(tp->t_state)) {
 				tp->t_softerror = error;
 			}
 			/* FALLTHROUGH */
 		default:
 			slot = 10;
 			rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
 			tp->t_flags &= ~TF_FORCEDATA;
 			return (error);
 		}
 	} else {
 		rack->rc_enobuf = 0;
 	}
 	TCPSTAT_INC(tcps_sndtotal);
 
 	/*
 	 * Data sent (as far as we can tell). If this advertises a larger
 	 * window than any other segment, then remember the size of the
 	 * advertised window. Any pending ACK has now been sent.
 	 */
 	if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
 		tp->rcv_adv = tp->rcv_nxt + recwin;
 	tp->last_ack_sent = tp->rcv_nxt;
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 enobufs:
 	rack->r_tlp_running = 0;
 	if ((flags & TH_RST) || (would_have_fin == 1)) {
 		/*
 		 * We don't send again after a RST. We also do *not* send
 		 * again if we would have had a find, but now have
 		 * outstanding data.
 		 */
 		slot = 0;
 		sendalot = 0;
 	}
 	if (slot) {
 		/* set the rack tcb into the slot N */
 		counter_u64_add(rack_paced_segments, 1);
 	} else if (sendalot) {
 		if (len)
 			counter_u64_add(rack_unpaced_segments, 1);
 		sack_rxmit = 0;
 		tp->t_flags &= ~TF_FORCEDATA;
 		goto again;
 	} else if (len) {
 		counter_u64_add(rack_unpaced_segments, 1);
 	}
 	tp->t_flags &= ~TF_FORCEDATA;
 	rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
 	return (error);
 }
 
 /*
  * rack_ctloutput() must drop the inpcb lock before performing copyin on
  * socket option arguments.  When it re-acquires the lock after the copy, it
  * has to revalidate that the connection is still valid for the socket
  * option.
  */
 static int
 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
 {
 	int32_t error = 0, optval;
 
 	switch (sopt->sopt_name) {
 	case TCP_RACK_PROP_RATE:
 	case TCP_RACK_PROP:
 	case TCP_RACK_TLP_REDUCE:
 	case TCP_RACK_EARLY_RECOV:
 	case TCP_RACK_PACE_ALWAYS:
 	case TCP_DELACK:
 	case TCP_RACK_PACE_REDUCE:
 	case TCP_RACK_PACE_MAX_SEG:
 	case TCP_RACK_PRR_SENDALOT:
 	case TCP_RACK_MIN_TO:
 	case TCP_RACK_EARLY_SEG:
 	case TCP_RACK_REORD_THRESH:
 	case TCP_RACK_REORD_FADE:
 	case TCP_RACK_TLP_THRESH:
 	case TCP_RACK_PKT_DELAY:
 	case TCP_RACK_TLP_USE:
 	case TCP_RACK_TLP_INC_VAR:
 	case TCP_RACK_IDLE_REDUCE_HIGH:
 	case TCP_RACK_MIN_PACE:
 	case TCP_RACK_MIN_PACE_SEG:
 	case TCP_BBR_RACK_RTT_USE:
 	case TCP_DATA_AFTER_CLOSE:
 		break;
 	default:
 		return (tcp_default_ctloutput(so, sopt, inp, tp));
 		break;
 	}
 	INP_WUNLOCK(inp);
 	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 	if (error)
 		return (error);
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	switch (sopt->sopt_name) {
 	case TCP_RACK_PROP_RATE:
 		if ((optval <= 0) || (optval >= 100)) {
 			error = EINVAL;
 			break;
 		}
 		RACK_OPTS_INC(tcp_rack_prop_rate);
 		rack->r_ctl.rc_prop_rate = optval;
 		break;
 	case TCP_RACK_TLP_USE:
 		if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
 			error = EINVAL;
 			break;
 		}
 		RACK_OPTS_INC(tcp_tlp_use);
 		rack->rack_tlp_threshold_use = optval;
 		break;
 	case TCP_RACK_PROP:
 		/* RACK proportional rate reduction (bool) */
 		RACK_OPTS_INC(tcp_rack_prop);
 		rack->r_ctl.rc_prop_reduce = optval;
 		break;
 	case TCP_RACK_TLP_REDUCE:
 		/* RACK TLP cwnd reduction (bool) */
 		RACK_OPTS_INC(tcp_rack_tlp_reduce);
 		rack->r_ctl.rc_tlp_cwnd_reduce = optval;
 		break;
 	case TCP_RACK_EARLY_RECOV:
 		/* Should recovery happen early (bool) */
 		RACK_OPTS_INC(tcp_rack_early_recov);
 		rack->r_ctl.rc_early_recovery = optval;
 		break;
 	case TCP_RACK_PACE_ALWAYS:
 		/* Use the always pace method (bool)  */
 		RACK_OPTS_INC(tcp_rack_pace_always);
 		if (optval > 0)
 			rack->rc_always_pace = 1;
 		else
 			rack->rc_always_pace = 0;
 		break;
 	case TCP_RACK_PACE_REDUCE:
 		/* RACK Hptsi reduction factor (divisor) */
 		RACK_OPTS_INC(tcp_rack_pace_reduce);
 		if (optval)
 			/* Must be non-zero */
 			rack->rc_pace_reduce = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_RACK_PACE_MAX_SEG:
 		/* Max segments in a pace */
 		RACK_OPTS_INC(tcp_rack_max_seg);
 		rack->rc_pace_max_segs = optval;
 		break;
 	case TCP_RACK_PRR_SENDALOT:
 		/* Allow PRR to send more than one seg */
 		RACK_OPTS_INC(tcp_rack_prr_sendalot);
 		rack->r_ctl.rc_prr_sendalot = optval;
 		break;
 	case TCP_RACK_MIN_TO:
 		/* Minimum time between rack t-o's in ms */
 		RACK_OPTS_INC(tcp_rack_min_to);
 		rack->r_ctl.rc_min_to = optval;
 		break;
 	case TCP_RACK_EARLY_SEG:
 		/* If early recovery max segments */
 		RACK_OPTS_INC(tcp_rack_early_seg);
 		rack->r_ctl.rc_early_recovery_segs = optval;
 		break;
 	case TCP_RACK_REORD_THRESH:
 		/* RACK reorder threshold (shift amount) */
 		RACK_OPTS_INC(tcp_rack_reord_thresh);
 		if ((optval > 0) && (optval < 31))
 			rack->r_ctl.rc_reorder_shift = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_RACK_REORD_FADE:
 		/* Does reordering fade after ms time */
 		RACK_OPTS_INC(tcp_rack_reord_fade);
 		rack->r_ctl.rc_reorder_fade = optval;
 		break;
 	case TCP_RACK_TLP_THRESH:
 		/* RACK TLP theshold i.e. srtt+(srtt/N) */
 		RACK_OPTS_INC(tcp_rack_tlp_thresh);
 		if (optval)
 			rack->r_ctl.rc_tlp_threshold = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_RACK_PKT_DELAY:
 		/* RACK added ms i.e. rack-rtt + reord + N */
 		RACK_OPTS_INC(tcp_rack_pkt_delay);
 		rack->r_ctl.rc_pkt_delay = optval;
 		break;
 	case TCP_RACK_TLP_INC_VAR:
 		/* Does TLP include rtt variance in t-o */
 		RACK_OPTS_INC(tcp_rack_tlp_inc_var);
 		rack->r_ctl.rc_prr_inc_var = optval;
 		break;
 	case TCP_RACK_IDLE_REDUCE_HIGH:
 		RACK_OPTS_INC(tcp_rack_idle_reduce_high);
 		if (optval)
 			rack->r_idle_reduce_largest = 1;
 		else
 			rack->r_idle_reduce_largest = 0;
 		break;
 	case TCP_DELACK:
 		if (optval == 0)
 			tp->t_delayed_ack = 0;
 		else
 			tp->t_delayed_ack = 1;
 		if (tp->t_flags & TF_DELACK) {
 			tp->t_flags &= ~TF_DELACK;
 			tp->t_flags |= TF_ACKNOW;
 			rack_output(tp);
 		}
 		break;
 	case TCP_RACK_MIN_PACE:
 		RACK_OPTS_INC(tcp_rack_min_pace);
 		if (optval > 3)
 			rack->r_enforce_min_pace = 3;
 		else
 			rack->r_enforce_min_pace = optval;
 		break;
 	case TCP_RACK_MIN_PACE_SEG:
 		RACK_OPTS_INC(tcp_rack_min_pace_seg);
 		if (optval >= 16)
 			rack->r_min_pace_seg_thresh = 15;
 		else
 			rack->r_min_pace_seg_thresh = optval;
 		break;
 	case TCP_BBR_RACK_RTT_USE:
 		if ((optval != USE_RTT_HIGH) &&
 		    (optval != USE_RTT_LOW) &&
 		    (optval != USE_RTT_AVG))
 			error = EINVAL;
 		else
 			rack->r_ctl.rc_rate_sample_method = optval;
 		break;
 	case TCP_DATA_AFTER_CLOSE:
 		if (optval)
 			rack->rc_allow_data_af_clo = 1;
 		else
 			rack->rc_allow_data_af_clo = 0;
 		break;
 	default:
 		return (tcp_default_ctloutput(so, sopt, inp, tp));
 		break;
 	}
 #ifdef NETFLIX_STATS
 	tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
 #endif
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 static int
 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
 {
 	int32_t error, optval;
 
 	/*
 	 * Because all our options are either boolean or an int, we can just
 	 * pull everything into optval and then unlock and copy. If we ever
 	 * add a option that is not a int, then this will have quite an
 	 * impact to this routine.
 	 */
 	switch (sopt->sopt_name) {
 	case TCP_RACK_PROP_RATE:
 		optval = rack->r_ctl.rc_prop_rate;
 		break;
 	case TCP_RACK_PROP:
 		/* RACK proportional rate reduction (bool) */
 		optval = rack->r_ctl.rc_prop_reduce;
 		break;
 	case TCP_RACK_TLP_REDUCE:
 		/* RACK TLP cwnd reduction (bool) */
 		optval = rack->r_ctl.rc_tlp_cwnd_reduce;
 		break;
 	case TCP_RACK_EARLY_RECOV:
 		/* Should recovery happen early (bool) */
 		optval = rack->r_ctl.rc_early_recovery;
 		break;
 	case TCP_RACK_PACE_REDUCE:
 		/* RACK Hptsi reduction factor (divisor) */
 		optval = rack->rc_pace_reduce;
 		break;
 	case TCP_RACK_PACE_MAX_SEG:
 		/* Max segments in a pace */
 		optval = rack->rc_pace_max_segs;
 		break;
 	case TCP_RACK_PACE_ALWAYS:
 		/* Use the always pace method */
 		optval = rack->rc_always_pace;
 		break;
 	case TCP_RACK_PRR_SENDALOT:
 		/* Allow PRR to send more than one seg */
 		optval = rack->r_ctl.rc_prr_sendalot;
 		break;
 	case TCP_RACK_MIN_TO:
 		/* Minimum time between rack t-o's in ms */
 		optval = rack->r_ctl.rc_min_to;
 		break;
 	case TCP_RACK_EARLY_SEG:
 		/* If early recovery max segments */
 		optval = rack->r_ctl.rc_early_recovery_segs;
 		break;
 	case TCP_RACK_REORD_THRESH:
 		/* RACK reorder threshold (shift amount) */
 		optval = rack->r_ctl.rc_reorder_shift;
 		break;
 	case TCP_RACK_REORD_FADE:
 		/* Does reordering fade after ms time */
 		optval = rack->r_ctl.rc_reorder_fade;
 		break;
 	case TCP_RACK_TLP_THRESH:
 		/* RACK TLP theshold i.e. srtt+(srtt/N) */
 		optval = rack->r_ctl.rc_tlp_threshold;
 		break;
 	case TCP_RACK_PKT_DELAY:
 		/* RACK added ms i.e. rack-rtt + reord + N */
 		optval = rack->r_ctl.rc_pkt_delay;
 		break;
 	case TCP_RACK_TLP_USE:
 		optval = rack->rack_tlp_threshold_use;
 		break;
 	case TCP_RACK_TLP_INC_VAR:
 		/* Does TLP include rtt variance in t-o */
 		optval = rack->r_ctl.rc_prr_inc_var;
 		break;
 	case TCP_RACK_IDLE_REDUCE_HIGH:
 		optval = rack->r_idle_reduce_largest;
 		break;
 	case TCP_RACK_MIN_PACE:
 		optval = rack->r_enforce_min_pace;
 		break;
 	case TCP_RACK_MIN_PACE_SEG:
 		optval = rack->r_min_pace_seg_thresh;
 		break;
 	case TCP_BBR_RACK_RTT_USE:
 		optval = rack->r_ctl.rc_rate_sample_method;
 		break;
 	case TCP_DELACK:
 		optval = tp->t_delayed_ack;
 		break;
 	case TCP_DATA_AFTER_CLOSE:
 		optval = rack->rc_allow_data_af_clo;
 		break;
 	default:
 		return (tcp_default_ctloutput(so, sopt, inp, tp));
 		break;
 	}
 	INP_WUNLOCK(inp);
 	error = sooptcopyout(sopt, &optval, sizeof optval);
 	return (error);
 }
 
 static int
 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
 {
 	int32_t error = EINVAL;
 	struct tcp_rack *rack;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack == NULL) {
 		/* Huh? */
 		goto out;
 	}
 	if (sopt->sopt_dir == SOPT_SET) {
 		return (rack_set_sockopt(so, sopt, inp, tp, rack));
 	} else if (sopt->sopt_dir == SOPT_GET) {
 		return (rack_get_sockopt(so, sopt, inp, tp, rack));
 	}
 out:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 
 struct tcp_function_block __tcp_rack = {
 	.tfb_tcp_block_name = __XSTRING(STACKNAME),
 	.tfb_tcp_output = rack_output,
 	.tfb_tcp_do_segment = rack_do_segment,
 	.tfb_tcp_hpts_do_segment = rack_hpts_do_segment,
 	.tfb_tcp_ctloutput = rack_ctloutput,
 	.tfb_tcp_fb_init = rack_init,
 	.tfb_tcp_fb_fini = rack_fini,
 	.tfb_tcp_timer_stop_all = rack_stopall,
 	.tfb_tcp_timer_activate = rack_timer_activate,
 	.tfb_tcp_timer_active = rack_timer_active,
 	.tfb_tcp_timer_stop = rack_timer_stop,
 	.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
 	.tfb_tcp_handoff_ok = rack_handoff_ok
 };
 
 static const char *rack_stack_names[] = {
 	__XSTRING(STACKNAME),
 #ifdef STACKALIAS
 	__XSTRING(STACKALIAS),
 #endif
 };
 
 static int
 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
 {
 	memset(mem, 0, size);
 	return (0);
 }
 
 static void
 rack_dtor(void *mem, int32_t size, void *arg)
 {
 
 }
 
 static bool rack_mod_inited = false;
 
 static int
 tcp_addrack(module_t mod, int32_t type, void *data)
 {
 	int32_t err = 0;
 	int num_stacks;
 
 	switch (type) {
 	case MOD_LOAD:
 		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
 		    sizeof(struct rack_sendmap),
 		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
 
 		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
 		    sizeof(struct tcp_rack),
 		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
 
 		sysctl_ctx_init(&rack_sysctl_ctx);
 		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
 		    OID_AUTO,
 		    __XSTRING(STACKNAME),
 		    CTLFLAG_RW, 0,
 		    "");
 		if (rack_sysctl_root == NULL) {
 			printf("Failed to add sysctl node\n");
 			err = EFAULT;
 			goto free_uma;
 		}
 		rack_init_sysctls();
 		num_stacks = nitems(rack_stack_names);
 		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
 		    rack_stack_names, &num_stacks);
 		if (err) {
 			printf("Failed to register %s stack name for "
 			    "%s module\n", rack_stack_names[num_stacks],
 			    __XSTRING(MODNAME));
 			sysctl_ctx_free(&rack_sysctl_ctx);
 free_uma:
 			uma_zdestroy(rack_zone);
 			uma_zdestroy(rack_pcb_zone);
 			rack_counter_destroy();
 			printf("Failed to register rack module -- err:%d\n", err);
 			return (err);
 		}
 		rack_mod_inited = true;
 		break;
 	case MOD_QUIESCE:
 		err = deregister_tcp_functions(&__tcp_rack, true, false);
 		break;
 	case MOD_UNLOAD:
 		err = deregister_tcp_functions(&__tcp_rack, false, true);
 		if (err == EBUSY)
 			break;
 		if (rack_mod_inited) {
 			uma_zdestroy(rack_zone);
 			uma_zdestroy(rack_pcb_zone);
 			sysctl_ctx_free(&rack_sysctl_ctx);
 			rack_counter_destroy();
 			rack_mod_inited = false;
 		}
 		err = 0;
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (err);
 }
 
 static moduledata_t tcp_rack = {
 	.name = __XSTRING(MODNAME),
 	.evhand = tcp_addrack,
 	.priv = 0
 };
 
 MODULE_VERSION(MODNAME, 1);
 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
Index: head/sys/netinet/tcp_timewait.c
===================================================================
--- head/sys/netinet/tcp_timewait.c	(revision 335355)
+++ head/sys/netinet/tcp_timewait.c	(revision 335356)
@@ -1,761 +1,753 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #ifndef INVARIANTS
 #include <sys/syslog.h>
 #endif
 #include <sys/protosw.h>
 #include <sys/random.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #ifdef INET6
 #include <netinet6/ip6protosw.h>
 #endif
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 static VNET_DEFINE(uma_zone_t, tcptw_zone);
 #define	V_tcptw_zone		VNET(tcptw_zone)
 static int	maxtcptw;
 
 /*
  * The timed wait queue contains references to each of the TCP sessions
  * currently in the TIME_WAIT state.  The queue pointers, including the
  * queue pointers in each tcptw structure, are protected using the global
  * timewait lock, which must be held over queue iteration and modification.
  *
  * Rules on tcptw usage:
  *  - a inpcb is always freed _after_ its tcptw
  *  - a tcptw relies on its inpcb reference counting for memory stability
  *  - a tcptw is dereferenceable only while its inpcb is locked
  */
 static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
 #define	V_twq_2msl		VNET(twq_2msl)
 
 /* Global timewait lock */
 static VNET_DEFINE(struct rwlock, tw_lock);
 #define	V_tw_lock		VNET(tw_lock)
 
 #define	TW_LOCK_INIT(tw, d)	rw_init_flags(&(tw), (d), 0)
 #define	TW_LOCK_DESTROY(tw)	rw_destroy(&(tw))
 #define	TW_RLOCK(tw)		rw_rlock(&(tw))
 #define	TW_WLOCK(tw)		rw_wlock(&(tw))
 #define	TW_RUNLOCK(tw)		rw_runlock(&(tw))
 #define	TW_WUNLOCK(tw)		rw_wunlock(&(tw))
 #define	TW_LOCK_ASSERT(tw)	rw_assert(&(tw), RA_LOCKED)
 #define	TW_RLOCK_ASSERT(tw)	rw_assert(&(tw), RA_RLOCKED)
 #define	TW_WLOCK_ASSERT(tw)	rw_assert(&(tw), RA_WLOCKED)
 #define	TW_UNLOCK_ASSERT(tw)	rw_assert(&(tw), RA_UNLOCKED)
 
 static void	tcp_tw_2msl_reset(struct tcptw *, int);
 static void	tcp_tw_2msl_stop(struct tcptw *, int);
 static int	tcp_twrespond(struct tcptw *, int);
 
 static int
 tcptw_auto_size(void)
 {
 	int halfrange;
 
 	/*
 	 * Max out at half the ephemeral port range so that TIME_WAIT
 	 * sockets don't tie up too many ephemeral ports.
 	 */
 	if (V_ipport_lastauto > V_ipport_firstauto)
 		halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2;
 	else
 		halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2;
 	/* Protect against goofy port ranges smaller than 32. */
 	return (imin(imax(halfrange, 32), maxsockets / 5));
 }
 
 static int
 sysctl_maxtcptw(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	if (maxtcptw == 0)
 		new = tcptw_auto_size();
 	else
 		new = maxtcptw;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr)
 		if (new >= 32) {
 			maxtcptw = new;
 			uma_zone_set_max(V_tcptw_zone, maxtcptw);
 		}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW,
     &maxtcptw, 0, sysctl_maxtcptw, "IU",
     "Maximum number of compressed TCP TIME_WAIT entries");
 
 static VNET_DEFINE(int, nolocaltimewait) = 0;
 #define	V_nolocaltimewait	VNET(nolocaltimewait)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(nolocaltimewait), 0,
     "Do not create compressed TCP TIME_WAIT entries for local connections");
 
 void
 tcp_tw_zone_change(void)
 {
 
 	if (maxtcptw == 0)
 		uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
 }
 
 void
 tcp_tw_init(void)
 {
 
 	V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw);
 	if (maxtcptw == 0)
 		uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
 	else
 		uma_zone_set_max(V_tcptw_zone, maxtcptw);
 	TAILQ_INIT(&V_twq_2msl);
 	TW_LOCK_INIT(V_tw_lock, "tcptw");
 }
 
 #ifdef VIMAGE
 void
 tcp_tw_destroy(void)
 {
 	struct tcptw *tw;
 
 	INP_INFO_RLOCK(&V_tcbinfo);
 	while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
 		tcp_twclose(tw, 0);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	TW_LOCK_DESTROY(V_tw_lock);
 	uma_zdestroy(V_tcptw_zone);
 }
 #endif
 
 /*
  * Move a TCP connection into TIME_WAIT state.
  *    tcbinfo is locked.
  *    inp is locked, and is unlocked before returning.
  */
 void
 tcp_twstart(struct tcpcb *tp)
 {
 	struct tcptw twlocal, *tw;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so;
 	bool acknow, local;
 #ifdef INET6
 	bool isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
 #endif
 
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	/* A dropped inp should never transition to TIME_WAIT state. */
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("tcp_twstart: "
 	    "(inp->inp_flags & INP_DROPPED) != 0"));
 
 	if (V_nolocaltimewait) {
 #ifdef INET6
 		if (isipv6)
 			local = in6_localaddr(&inp->in6p_faddr);
 		else
 #endif
 #ifdef INET
 			local = in_localip(inp->inp_faddr);
 #else
 			local = false;
 #endif
 	} else
 		local = false;
 
 	/*
 	 * For use only by DTrace.  We do not reference the state
 	 * after this point so modifying it in place is not a problem.
 	 */
 	tcp_state_change(tp, TCPS_TIME_WAIT);
 
 	if (local)
 		tw = &twlocal;
 	else
 		tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
 	if (tw == NULL) {
 		/*
 		 * Reached limit on total number of TIMEWAIT connections
 		 * allowed. Remove a connection from TIMEWAIT queue in LRU
 		 * fashion to make room for this connection.
 		 *
 		 * XXX:  Check if it possible to always have enough room
 		 * in advance based on guarantees provided by uma_zalloc().
 		 */
 		tw = tcp_tw_2msl_scan(1);
 		if (tw == NULL) {
 			tp = tcp_close(tp);
 			if (tp != NULL)
 				INP_WUNLOCK(inp);
 			return;
 		}
 	}
 	/*
 	 * For !local case the tcptw will hold a reference on its inpcb
 	 * until tcp_twclose is called.
 	 */
 	tw->tw_inpcb = inp;
 
 	/*
 	 * Recover last window size sent.
 	 */
 	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
 		tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;
 	else
 		tw->last_win = 0;
 
 	/*
 	 * Set t_recent if timestamps are used on the connection.
 	 */
 	if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
 	    (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
 		tw->t_recent = tp->ts_recent;
 		tw->ts_offset = tp->ts_offset;
 	} else {
 		tw->t_recent = 0;
 		tw->ts_offset = 0;
 	}
 
 	tw->snd_nxt = tp->snd_nxt;
 	tw->rcv_nxt = tp->rcv_nxt;
 	tw->iss     = tp->iss;
 	tw->irs     = tp->irs;
 	tw->t_starttime = tp->t_starttime;
 	tw->tw_time = 0;
 
 /* XXX
  * If this code will
  * be used for fin-wait-2 state also, then we may need
  * a ts_recent from the last segment.
  */
 	acknow = tp->t_flags & TF_ACKNOW;
 
 	/*
 	 * First, discard tcpcb state, which includes stopping its timers and
 	 * freeing it.  tcp_discardcb() used to also release the inpcb, but
 	 * that work is now done in the caller.
 	 *
 	 * Note: soisdisconnected() call used to be made in tcp_discardcb(),
 	 * and might not be needed here any longer.
 	 */
 	tcp_discardcb(tp);
 	so = inp->inp_socket;
 	soisdisconnected(so);
 	tw->tw_so_options = so->so_options;
 	inp->inp_flags |= INP_TIMEWAIT;
 	if (acknow)
 		tcp_twrespond(tw, TH_ACK);
 	if (local)
 		in_pcbdrop(inp);
 	else {
 		in_pcbref(inp);	/* Reference from tw */
 		tw->tw_cred = crhold(so->so_cred);
 		inp->inp_ppcb = tw;
 		TCPSTATES_INC(TCPS_TIME_WAIT);
 		tcp_tw_2msl_reset(tw, 0);
 	}
 
 	/*
 	 * If the inpcb owns the sole reference to the socket, then we can
 	 * detach and free the socket as it is not needed in time wait.
 	 */
 	if (inp->inp_flags & INP_SOCKREF) {
 		KASSERT(so->so_state & SS_PROTOREF,
 		    ("tcp_twstart: !SS_PROTOREF"));
 		inp->inp_flags &= ~INP_SOCKREF;
 		INP_WUNLOCK(inp);
 		SOCK_LOCK(so);
 		so->so_state &= ~SS_PROTOREF;
 		sofree(so);
 	} else
 		INP_WUNLOCK(inp);
 }
 
 /*
  * Returns 1 if the TIME_WAIT state was killed and we should start over,
  * looking for a pcb in the listen state.  Returns 0 otherwise.
  */
 int
 tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th,
     struct mbuf *m, int tlen)
 {
 	struct tcptw *tw;
 	int thflags;
 	tcp_seq seq;
 
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * XXXRW: Time wait state for inpcb has been recycled, but inpcb is
 	 * still present.  This is undesirable, but temporarily necessary
 	 * until we work out how to handle inpcb's who's timewait state has
 	 * been removed.
 	 */
 	tw = intotw(inp);
 	if (tw == NULL)
 		goto drop;
 
 	thflags = th->th_flags;
 
 	/*
 	 * NOTE: for FIN_WAIT_2 (to be added later),
 	 * must validate sequence number before accepting RST
 	 */
 
 	/*
 	 * If the segment contains RST:
 	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
 	 *      RFC 1337.
 	 */
 	if (thflags & TH_RST)
 		goto drop;
 
 #if 0
 /* PAWS not needed at the moment */
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
 	 * and it's less than ts_recent, drop it.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
 		if ((thflags & TH_ACK) == 0)
 			goto drop;
 		goto ack;
 	}
 	/*
 	 * ts_recent is never updated because we never accept new segments.
 	 */
 #endif
 
 	/*
 	 * If a new connection request is received
 	 * while in TIME_WAIT, drop the old connection
 	 * and start over if the sequence numbers
 	 * are above the previous ones.
 	 */
 	if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
 		tcp_twclose(tw, 0);
 		return (1);
 	}
 
 	/*
 	 * Drop the segment if it does not contain an ACK.
 	 */
 	if ((thflags & TH_ACK) == 0)
 		goto drop;
 
 	/*
 	 * Reset the 2MSL timer if this is a duplicate FIN.
 	 */
 	if (thflags & TH_FIN) {
 		seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0);
 		if (seq + 1 == tw->rcv_nxt)
 			tcp_tw_2msl_reset(tw, 1);
 	}
 
 	/*
 	 * Acknowledge the segment if it has data or is not a duplicate ACK.
 	 */
 	if (thflags != TH_ACK || tlen != 0 ||
 	    th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt)
 		tcp_twrespond(tw, TH_ACK);
 drop:
 	INP_WUNLOCK(inp);
 	m_freem(m);
 	return (0);
 }
 
 void
 tcp_twclose(struct tcptw *tw, int reuse)
 {
 	struct socket *so;
 	struct inpcb *inp;
 
 	/*
 	 * At this point, we are in one of two situations:
 	 *
 	 * (1) We have no socket, just an inpcb<->twtcp pair.  We can free
 	 *     all state.
 	 *
 	 * (2) We have a socket -- if we own a reference, release it and
 	 *     notify the socket layer.
 	 */
 	inp = tw->tw_inpcb;
 	KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
 	KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);	/* in_pcbfree() */
 	INP_WLOCK_ASSERT(inp);
 
 	tcp_tw_2msl_stop(tw, reuse);
 	inp->inp_ppcb = NULL;
 	in_pcbdrop(inp);
 
 	so = inp->inp_socket;
 	if (so != NULL) {
 		/*
 		 * If there's a socket, handle two cases: first, we own a
 		 * strong reference, which we will now release, or we don't
 		 * in which case another reference exists (XXXRW: think
 		 * about this more), and we don't need to take action.
 		 */
 		if (inp->inp_flags & INP_SOCKREF) {
 			inp->inp_flags &= ~INP_SOCKREF;
 			INP_WUNLOCK(inp);
 			SOCK_LOCK(so);
 			KASSERT(so->so_state & SS_PROTOREF,
 			    ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF"));
 			so->so_state &= ~SS_PROTOREF;
 			sofree(so);
 		} else {
 			/*
 			 * If we don't own the only reference, the socket and
 			 * inpcb need to be left around to be handled by
 			 * tcp_usr_detach() later.
 			 */
 			INP_WUNLOCK(inp);
 		}
 	} else {
 		/*
 		 * The socket has been already cleaned-up for us, only free the
 		 * inpcb.
 		 */
 		in_pcbfree(inp);
 	}
 	TCPSTAT_INC(tcps_closed);
 }
 
 static int
 tcp_twrespond(struct tcptw *tw, int flags)
 {
 	struct inpcb *inp = tw->tw_inpcb;
 #if defined(INET6) || defined(INET)
 	struct tcphdr *th = NULL;
 #endif
 	struct mbuf *m;
 #ifdef INET
 	struct ip *ip = NULL;
 #endif
 	u_int hdrlen, optlen;
 	int error = 0;			/* Keep compiler happy */
 	struct tcpopt to;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
 #endif
 	hdrlen = 0;                     /* Keep compiler happy */
 
 	INP_WLOCK_ASSERT(inp);
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOBUFS);
 	m->m_data += max_linkhdr;
 
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 
 #ifdef INET6
 	if (isipv6) {
 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		tcpip_fillheaders(inp, ip6, th);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		hdrlen = sizeof(struct tcpiphdr);
 		ip = mtod(m, struct ip *);
 		th = (struct tcphdr *)(ip + 1);
 		tcpip_fillheaders(inp, ip, th);
 	}
 #endif
 	to.to_flags = 0;
 
 	/*
 	 * Send a timestamp and echo-reply if both our side and our peer
 	 * have sent timestamps in our SYN's and this is not a RST.
 	 */
 	if (tw->t_recent && flags == TH_ACK) {
 		to.to_flags |= TOF_TS;
 		to.to_tsval = tcp_ts_getticks() + tw->ts_offset;
 		to.to_tsecr = tw->t_recent;
 	}
 	optlen = tcp_addoptions(&to, (u_char *)(th + 1));
 
 	m->m_len = hdrlen + optlen;
 	m->m_pkthdr.len = m->m_len;
 
 	KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));
 
 	th->th_seq = htonl(tw->snd_nxt);
 	th->th_ack = htonl(tw->rcv_nxt);
 	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 	th->th_flags = flags;
 	th->th_win = htons(tw->last_win);
 
 	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 #ifdef INET6
 	if (isipv6) {
 		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 		th->th_sum = in6_cksum_pseudo(ip6,
 		    sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0);
 		ip6->ip6_hlim = in6_selecthlim(inp, NULL);
 		error = ip6_output(m, inp->in6p_outputopts, NULL,
 		    (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
 		ip->ip_len = htons(m->m_pkthdr.len);
 		if (V_path_mtu_discovery)
 			ip->ip_off |= htons(IP_DF);
 		error = ip_output(m, inp->inp_options, NULL,
 		    ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
 		    NULL, inp);
 	}
 #endif
 	if (flags & TH_ACK)
 		TCPSTAT_INC(tcps_sndacks);
 	else
 		TCPSTAT_INC(tcps_sndctrl);
 	TCPSTAT_INC(tcps_sndtotal);
 	return (error);
 }
 
 static void
 tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
 {
 
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tw->tw_inpcb);
 
 	TW_WLOCK(V_tw_lock);
 	if (rearm)
 		TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
 	tw->tw_time = ticks + 2 * tcp_msl;
 	TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
 	TW_WUNLOCK(V_tw_lock);
 }
 
 static void
 tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
 {
 	struct ucred *cred;
 	struct inpcb *inp;
 	int released __unused;
 
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 
 	TW_WLOCK(V_tw_lock);
 	inp = tw->tw_inpcb;
 	tw->tw_inpcb = NULL;
 
 	TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
 	cred = tw->tw_cred;
 	tw->tw_cred = NULL;
 	TW_WUNLOCK(V_tw_lock);
 
 	if (cred != NULL)
 		crfree(cred);
 
 	released = in_pcbrele_wlocked(inp);
 	KASSERT(!released, ("%s: inp should not be released here", __func__));
 
 	if (!reuse)
 		uma_zfree(V_tcptw_zone, tw);
 	TCPSTATES_DEC(TCPS_TIME_WAIT);
 }
 
 struct tcptw *
 tcp_tw_2msl_scan(int reuse)
 {
 	struct tcptw *tw;
 	struct inpcb *inp;
 
 #ifdef INVARIANTS
 	if (reuse) {
 		/*
 		 * Exclusive pcbinfo lock is not required in reuse case even if
 		 * two inpcb locks can be acquired simultaneously:
 		 *  - the inpcb transitioning to TIME_WAIT state in
 		 *    tcp_tw_start(),
 		 *  - the inpcb closed by tcp_twclose().
 		 *
 		 * It is because only inpcbs in FIN_WAIT2 or CLOSING states can
 		 * transition in TIME_WAIT state.  Then a pcbcb cannot be in
 		 * TIME_WAIT list and transitioning to TIME_WAIT state at same
 		 * time.
 		 */
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	}
 #endif
 
 	for (;;) {
 		TW_RLOCK(V_tw_lock);
 		tw = TAILQ_FIRST(&V_twq_2msl);
 		if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
 			TW_RUNLOCK(V_tw_lock);
 			break;
 		}
 		KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
 		    __func__));
 
 		inp = tw->tw_inpcb;
 		in_pcbref(inp);
 		TW_RUNLOCK(V_tw_lock);
 
-		if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) {
-
-			INP_WLOCK(inp);
-			tw = intotw(inp);
-			if (in_pcbrele_wlocked(inp)) {
-				if (__predict_true(tw == NULL)) {
-					INP_INFO_RUNLOCK(&V_tcbinfo);
-					continue;
-				} else {
-					/* This should not happen as in TIMEWAIT
-					 * state the inp should not be destroyed
-					 * before its tcptw. If INVARIANTS is
-					 * defined panic.
-					 */
+		INP_INFO_RLOCK(&V_tcbinfo);
+		INP_WLOCK(inp);
+		tw = intotw(inp);
+		if (in_pcbrele_wlocked(inp)) {
+			if (__predict_true(tw == NULL)) {
+				INP_INFO_RUNLOCK(&V_tcbinfo);
+				continue;
+			} else {
+				/* This should not happen as in TIMEWAIT
+				 * state the inp should not be destroyed
+				 * before its tcptw. If INVARIANTS is
+				 * defined panic.
+				 */
 #ifdef INVARIANTS
-					panic("%s: Panic before an infinite "
-					    "loop: INP_TIMEWAIT && (INP_FREED "
-					    "|| inp last reference) && tw != "
-					    "NULL", __func__);
+				panic("%s: Panic before an infinite "
+					  "loop: INP_TIMEWAIT && (INP_FREED "
+					  "|| inp last reference) && tw != "
+					  "NULL", __func__);
 #else
-					log(LOG_ERR, "%s: Avoid an infinite "
-					    "loop: INP_TIMEWAIT && (INP_FREED "
-					    "|| inp last reference) && tw != "
-					    "NULL", __func__);
+				log(LOG_ERR, "%s: Avoid an infinite "
+					"loop: INP_TIMEWAIT && (INP_FREED "
+					"|| inp last reference) && tw != "
+					"NULL", __func__);
 #endif
-					INP_INFO_RUNLOCK(&V_tcbinfo);
-					break;
-				}
-			}
-
-			if (tw == NULL) {
-				/* tcp_twclose() has already been called */
-				INP_WUNLOCK(inp);
 				INP_INFO_RUNLOCK(&V_tcbinfo);
-				continue;
+				break;
 			}
+		}
 
-			tcp_twclose(tw, reuse);
+		if (tw == NULL) {
+			/* tcp_twclose() has already been called */
+			INP_WUNLOCK(inp);
 			INP_INFO_RUNLOCK(&V_tcbinfo);
-			if (reuse)
-			    return tw;
-		} else {
-			/* INP_INFO lock is busy, continue later. */
-			INP_WLOCK(inp);
-			if (!in_pcbrele_wlocked(inp))
-				INP_WUNLOCK(inp);
-			break;
+			continue;
 		}
+
+		tcp_twclose(tw, reuse);
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+		if (reuse)
+			return tw;
 	}
 
 	return NULL;
 }