diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c index 7e87e7c740e3..88e29157289d 100644 --- a/sys/kern/uipc_ktls.c +++ b/sys/kern/uipc_ktls.c @@ -1,2189 +1,2296 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014-2019 Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_kern_tls.h" #include "opt_ratelimit.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) #include #endif #include #include #include #ifdef RSS #include #include #endif #include #include #if defined(INET) || defined(INET6) #include #include #endif #include #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include struct ktls_wq { struct mtx mtx; STAILQ_HEAD(, mbuf) m_head; STAILQ_HEAD(, socket) so_head; bool running; int lastallocfail; } __aligned(CACHE_LINE_SIZE); struct ktls_domain_info { int count; int cpu[MAXCPU]; }; struct ktls_domain_info ktls_domains[MAXMEMDOM]; static struct ktls_wq *ktls_wq; static struct proc *ktls_proc; static uma_zone_t ktls_session_zone; static uma_zone_t ktls_buffer_zone; static uint16_t ktls_cpuid_lookup[MAXCPU]; SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Kernel TLS offload"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Kernel TLS offload stats"); #ifdef RSS static int ktls_bind_threads = 1; #else static int ktls_bind_threads; #endif SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN, &ktls_bind_threads, 0, "Bind crypto threads to cores (1) or cores and domains (2) at boot"); static u_int ktls_maxlen = 16384; SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN, &ktls_maxlen, 0, "Maximum TLS record size"); static int ktls_number_threads; SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD, &ktls_number_threads, 0, "Number of TLS threads in thread-pool"); +unsigned int ktls_ifnet_max_rexmit_pct = 2; +SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN, + &ktls_ifnet_max_rexmit_pct, 2, + "Max percent bytes retransmitted before ifnet TLS is disabled"); + static bool ktls_offload_enable; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN, &ktls_offload_enable, 0, "Enable support for kernel TLS offload"); static bool ktls_cbc_enable = true; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN, &ktls_cbc_enable, 1, "Enable Support of AES-CBC crypto for kernel TLS"); static bool ktls_sw_buffer_cache = true; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN, &ktls_sw_buffer_cache, 1, "Enable caching of output buffers for SW encryption"); static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active); SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD, &ktls_tasks_active, "Number of active tasks"); static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_queued); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD, &ktls_cnt_tx_queued, "Number of TLS records in queue to tasks for SW encryption"); static COUNTER_U64_DEFINE_EARLY(ktls_cnt_rx_queued); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD, &ktls_cnt_rx_queued, "Number of TLS sockets in queue to tasks for SW decryption"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_total); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total, CTLFLAG_RD, &ktls_offload_total, "Total successful TLS setups (parameters set)"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_enable_calls); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls, CTLFLAG_RD, &ktls_offload_enable_calls, "Total number of TLS enable calls made"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_active); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD, &ktls_offload_active, "Total Active TLS sessions"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_corrupted_records); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD, &ktls_offload_corrupted_records, "Total corrupted TLS records received"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_failed_crypto); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD, &ktls_offload_failed_crypto, "Total TLS crypto failures"); static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_ifnet); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD, &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet"); static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_sw); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD, &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW"); static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD, &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet"); +static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail); +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, CTLFLAG_RD, + &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from ifnet"); + +static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok); +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD, + &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet"); + SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Software TLS session stats"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Hardware (ifnet) TLS session stats"); #ifdef TCP_OFFLOAD SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "TOE TLS session stats"); #endif static COUNTER_U64_DEFINE_EARLY(ktls_sw_cbc); SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc, "Active number of software TLS sessions using AES-CBC"); static COUNTER_U64_DEFINE_EARLY(ktls_sw_gcm); SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm, "Active number of software TLS sessions using AES-GCM"); static COUNTER_U64_DEFINE_EARLY(ktls_sw_chacha20); SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, chacha20, CTLFLAG_RD, &ktls_sw_chacha20, "Active number of software TLS sessions using Chacha20-Poly1305"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_cbc); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD, &ktls_ifnet_cbc, "Active number of ifnet TLS sessions using AES-CBC"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_gcm); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD, &ktls_ifnet_gcm, "Active number of ifnet TLS sessions using AES-GCM"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_chacha20); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, chacha20, CTLFLAG_RD, &ktls_ifnet_chacha20, "Active number of ifnet TLS sessions using Chacha20-Poly1305"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD, &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_dropped); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD, &ktls_ifnet_reset_dropped, "TLS sessions dropped after failing to update ifnet send tag"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_failed); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD, &ktls_ifnet_reset_failed, "TLS sessions that failed to allocate a new ifnet send tag"); static int ktls_ifnet_permitted; SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN, &ktls_ifnet_permitted, 1, "Whether to permit hardware (ifnet) TLS sessions"); #ifdef TCP_OFFLOAD static COUNTER_U64_DEFINE_EARLY(ktls_toe_cbc); SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD, &ktls_toe_cbc, "Active number of TOE TLS sessions using AES-CBC"); static COUNTER_U64_DEFINE_EARLY(ktls_toe_gcm); SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD, &ktls_toe_gcm, "Active number of TOE TLS sessions using AES-GCM"); static COUNTER_U64_DEFINE_EARLY(ktls_toe_chacha20); SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, chacha20, CTLFLAG_RD, &ktls_toe_chacha20, "Active number of TOE TLS sessions using Chacha20-Poly1305"); #endif static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS"); static void ktls_cleanup(struct ktls_session *tls); #if defined(INET) || defined(INET6) static void ktls_reset_send_tag(void *context, int pending); #endif static void ktls_work_thread(void *ctx); #if defined(INET) || defined(INET6) static u_int ktls_get_cpu(struct socket *so) { struct inpcb *inp; #ifdef NUMA struct ktls_domain_info *di; #endif u_int cpuid; inp = sotoinpcb(so); #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid != NETISR_CPUID_NONE) return (cpuid); #endif /* * Just use the flowid to shard connections in a repeatable * fashion. Note that TLS 1.0 sessions rely on the * serialization provided by having the same connection use * the same queue. */ #ifdef NUMA if (ktls_bind_threads > 1 && inp->inp_numa_domain != M_NODOM) { di = &ktls_domains[inp->inp_numa_domain]; cpuid = di->cpu[inp->inp_flowid % di->count]; } else #endif cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]; return (cpuid); } #endif static int ktls_buffer_import(void *arg, void **store, int count, int domain, int flags) { vm_page_t m; int i; KASSERT((ktls_maxlen & PAGE_MASK) == 0, ("%s: ktls max length %d is not page size-aligned", __func__, ktls_maxlen)); for (i = 0; i < count; i++) { m = vm_page_alloc_contig_domain(NULL, 0, domain, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_NODUMP | malloc2vm_flags(flags), atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if (m == NULL) break; store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); } return (i); } static void ktls_buffer_release(void *arg __unused, void **store, int count) { vm_page_t m; int i, j; for (i = 0; i < count; i++) { m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); for (j = 0; j < atop(ktls_maxlen); j++) { (void)vm_page_unwire_noq(m + j); vm_page_free(m + j); } } } static void ktls_free_mext_contig(struct mbuf *m) { M_ASSERTEXTPG(m); uma_zfree(ktls_buffer_zone, (void *)PHYS_TO_DMAP(m->m_epg_pa[0])); } static void ktls_init(void *dummy __unused) { struct thread *td; struct pcpu *pc; cpuset_t mask; int count, domain, error, i; ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS, M_WAITOK | M_ZERO); ktls_session_zone = uma_zcreate("ktls_session", sizeof(struct ktls_session), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); if (ktls_sw_buffer_cache) { ktls_buffer_zone = uma_zcache_create("ktls_buffers", roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL, ktls_buffer_import, ktls_buffer_release, NULL, UMA_ZONE_FIRSTTOUCH); } /* * Initialize the workqueues to run the TLS work. We create a * work queue for each CPU. */ CPU_FOREACH(i) { STAILQ_INIT(&ktls_wq[i].m_head); STAILQ_INIT(&ktls_wq[i].so_head); mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF); error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i], &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i); if (error) panic("Can't add KTLS thread %d error %d", i, error); /* * Bind threads to cores. If ktls_bind_threads is > * 1, then we bind to the NUMA domain. */ if (ktls_bind_threads) { if (ktls_bind_threads > 1) { pc = pcpu_find(i); domain = pc->pc_domain; CPU_COPY(&cpuset_domain[domain], &mask); count = ktls_domains[domain].count; ktls_domains[domain].cpu[count] = i; ktls_domains[domain].count++; } else { CPU_SETOF(i, &mask); } error = cpuset_setthread(td->td_tid, &mask); if (error) panic( "Unable to bind KTLS thread for CPU %d error %d", i, error); } ktls_cpuid_lookup[ktls_number_threads] = i; ktls_number_threads++; } /* * If we somehow have an empty domain, fall back to choosing * among all KTLS threads. */ if (ktls_bind_threads > 1) { for (i = 0; i < vm_ndomains; i++) { if (ktls_domains[i].count == 0) { ktls_bind_threads = 1; break; } } } if (bootverbose) printf("KTLS: Initialized %d threads\n", ktls_number_threads); } SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL); #if defined(INET) || defined(INET6) static int ktls_create_session(struct socket *so, struct tls_enable *en, struct ktls_session **tlsp) { struct ktls_session *tls; int error; /* Only TLS 1.0 - 1.3 are supported. */ if (en->tls_vmajor != TLS_MAJOR_VER_ONE) return (EINVAL); if (en->tls_vminor < TLS_MINOR_VER_ZERO || en->tls_vminor > TLS_MINOR_VER_THREE) return (EINVAL); if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv)) return (EINVAL); /* All supported algorithms require a cipher key. */ if (en->cipher_key_len == 0) return (EINVAL); /* No flags are currently supported. */ if (en->flags != 0) return (EINVAL); /* Common checks for supported algorithms. */ switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: /* * auth_algorithm isn't used, but permit GMAC values * for compatibility. */ switch (en->auth_algorithm) { case 0: #ifdef COMPAT_FREEBSD12 /* XXX: Really 13.0-current COMPAT. */ case CRYPTO_AES_128_NIST_GMAC: case CRYPTO_AES_192_NIST_GMAC: case CRYPTO_AES_256_NIST_GMAC: #endif break; default: return (EINVAL); } if (en->auth_key_len != 0) return (EINVAL); if ((en->tls_vminor == TLS_MINOR_VER_TWO && en->iv_len != TLS_AEAD_GCM_LEN) || (en->tls_vminor == TLS_MINOR_VER_THREE && en->iv_len != TLS_1_3_GCM_IV_LEN)) return (EINVAL); break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: /* * TLS 1.0 requires an implicit IV. TLS 1.1+ * all use explicit IVs. */ if (en->tls_vminor == TLS_MINOR_VER_ZERO) { if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN) return (EINVAL); break; } /* FALLTHROUGH */ case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: /* Ignore any supplied IV. */ en->iv_len = 0; break; default: return (EINVAL); } if (en->auth_key_len == 0) return (EINVAL); break; case CRYPTO_CHACHA20_POLY1305: if (en->auth_algorithm != 0 || en->auth_key_len != 0) return (EINVAL); if (en->tls_vminor != TLS_MINOR_VER_TWO && en->tls_vminor != TLS_MINOR_VER_THREE) return (EINVAL); if (en->iv_len != TLS_CHACHA20_IV_LEN) return (EINVAL); break; default: return (EINVAL); } tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls->refcount, 1); TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls); tls->wq_index = ktls_get_cpu(so); tls->params.cipher_algorithm = en->cipher_algorithm; tls->params.auth_algorithm = en->auth_algorithm; tls->params.tls_vmajor = en->tls_vmajor; tls->params.tls_vminor = en->tls_vminor; tls->params.flags = en->flags; tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen); /* Set the header and trailer lengths. */ tls->params.tls_hlen = sizeof(struct tls_record_layer); switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: /* * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte * nonce. TLS 1.3 uses a 12 byte implicit IV. */ if (en->tls_vminor < TLS_MINOR_VER_THREE) tls->params.tls_hlen += sizeof(uint64_t); tls->params.tls_tlen = AES_GMAC_HASH_LEN; tls->params.tls_bs = 1; break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: if (en->tls_vminor == TLS_MINOR_VER_ZERO) { /* Implicit IV, no nonce. */ } else { tls->params.tls_hlen += AES_BLOCK_LEN; } tls->params.tls_tlen = AES_BLOCK_LEN + SHA1_HASH_LEN; break; case CRYPTO_SHA2_256_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_256_HASH_LEN; break; case CRYPTO_SHA2_384_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_384_HASH_LEN; break; default: panic("invalid hmac"); } tls->params.tls_bs = AES_BLOCK_LEN; break; case CRYPTO_CHACHA20_POLY1305: /* * Chacha20 uses a 12 byte implicit IV. */ tls->params.tls_tlen = POLY1305_HASH_LEN; tls->params.tls_bs = 1; break; default: panic("invalid cipher"); } /* * TLS 1.3 includes optional padding which we do not support, * and also puts the "real" record type at the end of the * encrypted data. */ if (en->tls_vminor == TLS_MINOR_VER_THREE) tls->params.tls_tlen += sizeof(uint8_t); KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN, ("TLS header length too long: %d", tls->params.tls_hlen)); KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN, ("TLS trailer length too long: %d", tls->params.tls_tlen)); if (en->auth_key_len != 0) { tls->params.auth_key_len = en->auth_key_len; tls->params.auth_key = malloc(en->auth_key_len, M_KTLS, M_WAITOK); error = copyin(en->auth_key, tls->params.auth_key, en->auth_key_len); if (error) goto out; } tls->params.cipher_key_len = en->cipher_key_len; tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK); error = copyin(en->cipher_key, tls->params.cipher_key, en->cipher_key_len); if (error) goto out; /* * This holds the implicit portion of the nonce for AEAD * ciphers and the initial implicit IV for TLS 1.0. The * explicit portions of the IV are generated in ktls_frame(). */ if (en->iv_len != 0) { tls->params.iv_len = en->iv_len; error = copyin(en->iv, tls->params.iv, en->iv_len); if (error) goto out; /* * For TLS 1.2 with GCM, generate an 8-byte nonce as a * counter to generate unique explicit IVs. * * Store this counter in the last 8 bytes of the IV * array so that it is 8-byte aligned. */ if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 && en->tls_vminor == TLS_MINOR_VER_TWO) arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0); } *tlsp = tls; return (0); out: ktls_cleanup(tls); return (error); } static struct ktls_session * ktls_clone_session(struct ktls_session *tls) { struct ktls_session *tls_new; tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls_new->refcount, 1); /* Copy fields from existing session. */ tls_new->params = tls->params; tls_new->wq_index = tls->wq_index; /* Deep copy keys. */ if (tls_new->params.auth_key != NULL) { tls_new->params.auth_key = malloc(tls->params.auth_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.auth_key, tls->params.auth_key, tls->params.auth_key_len); } tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.cipher_key, tls->params.cipher_key, tls->params.cipher_key_len); return (tls_new); } #endif static void ktls_cleanup(struct ktls_session *tls) { counter_u64_add(ktls_offload_active, -1); switch (tls->mode) { case TCP_TLS_MODE_SW: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, -1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_sw_chacha20, -1); break; } ktls_ocf_free(tls); break; case TCP_TLS_MODE_IFNET: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, -1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_ifnet_chacha20, -1); break; } if (tls->snd_tag != NULL) m_snd_tag_rele(tls->snd_tag); break; #ifdef TCP_OFFLOAD case TCP_TLS_MODE_TOE: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_toe_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_toe_gcm, -1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_toe_chacha20, -1); break; } break; #endif } if (tls->params.auth_key != NULL) { zfree(tls->params.auth_key, M_KTLS); tls->params.auth_key = NULL; tls->params.auth_key_len = 0; } if (tls->params.cipher_key != NULL) { zfree(tls->params.cipher_key, M_KTLS); tls->params.cipher_key = NULL; tls->params.cipher_key_len = 0; } explicit_bzero(tls->params.iv, sizeof(tls->params.iv)); } #if defined(INET) || defined(INET6) #ifdef TCP_OFFLOAD static int ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction) { struct inpcb *inp; struct tcpcb *tp; int error; inp = so->so_pcb; INP_WLOCK(inp); if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return (ECONNRESET); } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); if (!(tp->t_flags & TF_TOE)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = tcp_offload_alloc_tls_session(tp, tls, direction); INP_WUNLOCK(inp); if (error == 0) { tls->mode = TCP_TLS_MODE_TOE; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_toe_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_toe_gcm, 1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_toe_chacha20, 1); break; } } return (error); } #endif /* * Common code used when first enabling ifnet TLS on a connection or * when allocating a new ifnet TLS session due to a routing change. * This function allocates a new TLS send tag on whatever interface * the connection is currently routed over. */ static int ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force, struct m_snd_tag **mstp) { union if_snd_tag_alloc_params params; struct ifnet *ifp; struct nhop_object *nh; struct tcpcb *tp; int error; INP_RLOCK(inp); if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_RUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); /* * Check administrative controls on ifnet TLS to determine if * ifnet TLS should be denied. * * - Always permit 'force' requests. * - ktls_ifnet_permitted == 0: always deny. */ if (!force && ktls_ifnet_permitted == 0) { INP_RUNLOCK(inp); return (ENXIO); } /* * XXX: Use the cached route in the inpcb to find the * interface. This should perhaps instead use * rtalloc1_fib(dst, 0, 0, fibnum). Since KTLS is only * enabled after a connection has completed key negotiation in * userland, the cached route will be present in practice. */ nh = inp->inp_route.ro_nh; if (nh == NULL) { INP_RUNLOCK(inp); return (ENXIO); } ifp = nh->nh_ifp; if_ref(ifp); /* * Allocate a TLS + ratelimit tag if the connection has an * existing pacing rate. */ if (tp->t_pacing_rate != -1 && (ifp->if_capenable & IFCAP_TXTLS_RTLMT) != 0) { params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT; params.tls_rate_limit.inp = inp; params.tls_rate_limit.tls = tls; params.tls_rate_limit.max_rate = tp->t_pacing_rate; } else { params.hdr.type = IF_SND_TAG_TYPE_TLS; params.tls.inp = inp; params.tls.tls = tls; } params.hdr.flowid = inp->inp_flowid; params.hdr.flowtype = inp->inp_flowtype; params.hdr.numa_domain = inp->inp_numa_domain; INP_RUNLOCK(inp); if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) { error = EOPNOTSUPP; goto out; } if (inp->inp_vflag & INP_IPV6) { if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) { error = EOPNOTSUPP; goto out; } } else { if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) { error = EOPNOTSUPP; goto out; } } error = m_snd_tag_alloc(ifp, ¶ms, mstp); out: if_rele(ifp); return (error); } static int ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force) { struct m_snd_tag *mst; int error; error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &mst); if (error == 0) { tls->mode = TCP_TLS_MODE_IFNET; tls->snd_tag = mst; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, 1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_ifnet_chacha20, 1); break; } } return (error); } static int ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction) { int error; error = ktls_ocf_try(so, tls, direction); if (error) return (error); tls->mode = TCP_TLS_MODE_SW; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, 1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_sw_chacha20, 1); break; } return (0); } /* * KTLS RX stores data in the socket buffer as a list of TLS records, * where each record is stored as a control message containg the TLS * header followed by data mbufs containing the decrypted data. This * is different from KTLS TX which always uses an mb_ext_pgs mbuf for * both encrypted and decrypted data. TLS records decrypted by a NIC * should be queued to the socket buffer as records, but encrypted * data which needs to be decrypted by software arrives as a stream of * regular mbufs which need to be converted. In addition, there may * already be pending encrypted data in the socket buffer when KTLS RX * is enabled. * * To manage not-yet-decrypted data for KTLS RX, the following scheme * is used: * * - A single chain of NOTREADY mbufs is hung off of sb_mtls. * * - ktls_check_rx checks this chain of mbufs reading the TLS header * from the first mbuf. Once all of the data for that TLS record is * queued, the socket is queued to a worker thread. * * - The worker thread calls ktls_decrypt to decrypt TLS records in * the TLS chain. Each TLS record is detached from the TLS chain, * decrypted, and inserted into the regular socket buffer chain as * record starting with a control message holding the TLS header and * a chain of mbufs holding the encrypted data. */ static void sb_mark_notready(struct sockbuf *sb) { struct mbuf *m; m = sb->sb_mb; sb->sb_mtls = m; sb->sb_mb = NULL; sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; for (; m != NULL; m = m->m_next) { KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL", __func__)); KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail", __func__)); KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len", __func__)); m->m_flags |= M_NOTREADY; sb->sb_acc -= m->m_len; sb->sb_tlscc += m->m_len; sb->sb_mtlstail = m; } KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc, ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc, sb->sb_ccc)); } int ktls_enable_rx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; int error; if (!ktls_offload_enable) return (ENOTSUP); if (SOLISTENING(so)) return (EINVAL); counter_u64_add(ktls_offload_enable_calls, 1); /* * This should always be true since only the TCP socket option * invokes this function. */ if (so->so_proto->pr_protocol != IPPROTO_TCP) return (EINVAL); /* * XXX: Don't overwrite existing sessions. We should permit * this to support rekeying in the future. */ if (so->so_rcv.sb_tls_info != NULL) return (EALREADY); if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); /* TLS 1.3 is not yet supported. */ if (en->tls_vmajor == TLS_MAJOR_VER_ONE && en->tls_vminor == TLS_MINOR_VER_THREE) return (ENOTSUP); error = ktls_create_session(so, en, &tls); if (error) return (error); #ifdef TCP_OFFLOAD error = ktls_try_toe(so, tls, KTLS_RX); if (error) #endif error = ktls_try_sw(so, tls, KTLS_RX); if (error) { ktls_cleanup(tls); return (error); } /* Mark the socket as using TLS offload. */ SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq); so->so_rcv.sb_tls_info = tls; so->so_rcv.sb_flags |= SB_TLS_RX; /* Mark existing data as not ready until it can be decrypted. */ if (tls->mode != TCP_TLS_MODE_TOE) { sb_mark_notready(&so->so_rcv); ktls_check_rx(&so->so_rcv); } SOCKBUF_UNLOCK(&so->so_rcv); counter_u64_add(ktls_offload_total, 1); return (0); } int ktls_enable_tx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; struct inpcb *inp; int error; if (!ktls_offload_enable) return (ENOTSUP); if (SOLISTENING(so)) return (EINVAL); counter_u64_add(ktls_offload_enable_calls, 1); /* * This should always be true since only the TCP socket option * invokes this function. */ if (so->so_proto->pr_protocol != IPPROTO_TCP) return (EINVAL); /* * XXX: Don't overwrite existing sessions. We should permit * this to support rekeying in the future. */ if (so->so_snd.sb_tls_info != NULL) return (EALREADY); if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); /* TLS requires ext pgs */ if (mb_use_ext_pgs == 0) return (ENXIO); error = ktls_create_session(so, en, &tls); if (error) return (error); /* Prefer TOE -> ifnet TLS -> software TLS. */ #ifdef TCP_OFFLOAD error = ktls_try_toe(so, tls, KTLS_TX); if (error) #endif error = ktls_try_ifnet(so, tls, false); if (error) error = ktls_try_sw(so, tls, KTLS_TX); if (error) { ktls_cleanup(tls); return (error); } error = sblock(&so->so_snd, SBL_WAIT); if (error) { ktls_cleanup(tls); return (error); } /* * Write lock the INP when setting sb_tls_info so that * routines in tcp_ratelimit.c can read sb_tls_info while * holding the INP lock. */ inp = so->so_pcb; INP_WLOCK(inp); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_seqno = be64dec(en->rec_seq); so->so_snd.sb_tls_info = tls; if (tls->mode != TCP_TLS_MODE_SW) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); sbunlock(&so->so_snd); counter_u64_add(ktls_offload_total, 1); return (0); } int ktls_get_rx_mode(struct socket *so) { struct ktls_session *tls; struct inpcb *inp; int mode; if (SOLISTENING(so)) return (EINVAL); inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(&so->so_rcv); tls = so->so_rcv.sb_tls_info; if (tls == NULL) mode = TCP_TLS_MODE_NONE; else mode = tls->mode; SOCKBUF_UNLOCK(&so->so_rcv); return (mode); } int ktls_get_tx_mode(struct socket *so) { struct ktls_session *tls; struct inpcb *inp; int mode; if (SOLISTENING(so)) return (EINVAL); inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(&so->so_snd); tls = so->so_snd.sb_tls_info; if (tls == NULL) mode = TCP_TLS_MODE_NONE; else mode = tls->mode; SOCKBUF_UNLOCK(&so->so_snd); return (mode); } /* * Switch between SW and ifnet TLS sessions as requested. */ int ktls_set_tx_mode(struct socket *so, int mode) { struct ktls_session *tls, *tls_new; struct inpcb *inp; int error; if (SOLISTENING(so)) return (EINVAL); switch (mode) { case TCP_TLS_MODE_SW: case TCP_TLS_MODE_IFNET: break; default: return (EINVAL); } inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(&so->so_snd); tls = so->so_snd.sb_tls_info; if (tls == NULL) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } if (tls->mode == mode) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } tls = ktls_hold(tls); SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); tls_new = ktls_clone_session(tls); if (mode == TCP_TLS_MODE_IFNET) error = ktls_try_ifnet(so, tls_new, true); else error = ktls_try_sw(so, tls_new, KTLS_TX); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } error = sblock(&so->so_snd, SBL_WAIT); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } /* * If we raced with another session change, keep the existing * session. */ if (tls != so->so_snd.sb_tls_info) { counter_u64_add(ktls_switch_failed, 1); sbunlock(&so->so_snd); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (EBUSY); } SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_info = tls_new; if (tls_new->mode != TCP_TLS_MODE_SW) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); sbunlock(&so->so_snd); /* * Drop two references on 'tls'. The first is for the * ktls_hold() above. The second drops the reference from the * socket buffer. */ KASSERT(tls->refcount >= 2, ("too few references on old session")); ktls_free(tls); ktls_free(tls); if (mode == TCP_TLS_MODE_IFNET) counter_u64_add(ktls_switch_to_ifnet, 1); else counter_u64_add(ktls_switch_to_sw, 1); INP_WLOCK(inp); return (0); } /* * Try to allocate a new TLS send tag. This task is scheduled when * ip_output detects a route change while trying to transmit a packet * holding a TLS record. If a new tag is allocated, replace the tag * in the TLS session. Subsequent packets on the connection will use * the new tag. If a new tag cannot be allocated, drop the * connection. */ static void ktls_reset_send_tag(void *context, int pending) { struct epoch_tracker et; struct ktls_session *tls; struct m_snd_tag *old, *new; struct inpcb *inp; struct tcpcb *tp; int error; MPASS(pending == 1); tls = context; inp = tls->inp; /* * Free the old tag first before allocating a new one. * ip[6]_output_send() will treat a NULL send tag the same as * an ifp mismatch and drop packets until a new tag is * allocated. * * Write-lock the INP when changing tls->snd_tag since * ip[6]_output_send() holds a read-lock when reading the * pointer. */ INP_WLOCK(inp); old = tls->snd_tag; tls->snd_tag = NULL; INP_WUNLOCK(inp); if (old != NULL) m_snd_tag_rele(old); error = ktls_alloc_snd_tag(inp, tls, true, &new); if (error == 0) { INP_WLOCK(inp); tls->snd_tag = new; mtx_pool_lock(mtxpool_sleep, tls); tls->reset_pending = false; mtx_pool_unlock(mtxpool_sleep, tls); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset, 1); /* * XXX: Should we kick tcp_output explicitly now that * the send tag is fixed or just rely on timers? */ } else { NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (!in_pcbrele_wlocked(inp)) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); CURVNET_SET(tp->t_vnet); tp = tcp_drop(tp, ECONNABORTED); CURVNET_RESTORE(); if (tp != NULL) INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset_dropped, 1); } else INP_WUNLOCK(inp); } NET_EPOCH_EXIT(et); counter_u64_add(ktls_ifnet_reset_failed, 1); /* * Leave reset_pending true to avoid future tasks while * the socket goes away. */ } ktls_free(tls); } int ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls) { if (inp == NULL) return (ENOBUFS); INP_LOCK_ASSERT(inp); /* * See if we should schedule a task to update the send tag for * this session. */ mtx_pool_lock(mtxpool_sleep, tls); if (!tls->reset_pending) { (void) ktls_hold(tls); in_pcbref(inp); tls->inp = inp; tls->reset_pending = true; taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task); } mtx_pool_unlock(mtxpool_sleep, tls); return (ENOBUFS); } #ifdef RATELIMIT int ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; struct m_snd_tag *mst; struct ifnet *ifp; /* Can't get to the inp, but it should be locked. */ /* INP_LOCK_ASSERT(inp); */ MPASS(tls->mode == TCP_TLS_MODE_IFNET); if (tls->snd_tag == NULL) { /* * Resetting send tag, ignore this change. The * pending reset may or may not see this updated rate * in the tcpcb. If it doesn't, we will just lose * this rate change. */ return (0); } MPASS(tls->snd_tag != NULL); MPASS(tls->snd_tag->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT); mst = tls->snd_tag; ifp = mst->ifp; return (ifp->if_snd_tag_modify(mst, ¶ms)); } #endif #endif void ktls_destroy(struct ktls_session *tls) { ktls_cleanup(tls); uma_zfree(ktls_session_zone, tls); } void ktls_seq(struct sockbuf *sb, struct mbuf *m) { for (; m != NULL; m = m->m_next) { KASSERT((m->m_flags & M_EXTPG) != 0, ("ktls_seq: mapped mbuf %p", m)); m->m_epg_seqno = sb->sb_tls_seqno; sb->sb_tls_seqno++; } } /* * Add TLS framing (headers and trailers) to a chain of mbufs. Each * mbuf in the chain must be an unmapped mbuf. The payload of the * mbuf must be populated with the payload of each TLS record. * * The record_type argument specifies the TLS record type used when * populating the TLS header. * * The enq_count argument on return is set to the number of pages of * payload data for this entire chain that need to be encrypted via SW * encryption. The returned value should be passed to ktls_enqueue * when scheduling encryption of this chain of mbufs. To handle the * special case of empty fragments for TLS 1.0 sessions, an empty * fragment counts as one page. */ void ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt, uint8_t record_type) { struct tls_record_layer *tlshdr; struct mbuf *m; uint64_t *noncep; uint16_t tls_len; int maxlen; maxlen = tls->params.max_frame_len; *enq_cnt = 0; for (m = top; m != NULL; m = m->m_next) { /* * All mbufs in the chain should be TLS records whose * payload does not exceed the maximum frame length. * * Empty TLS records are permitted when using CBC. */ KASSERT(m->m_len <= maxlen && (tls->params.cipher_algorithm == CRYPTO_AES_CBC ? m->m_len >= 0 : m->m_len > 0), ("ktls_frame: m %p len %d\n", m, m->m_len)); /* * TLS frames require unmapped mbufs to store session * info. */ KASSERT((m->m_flags & M_EXTPG) != 0, ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top)); tls_len = m->m_len; /* Save a reference to the session. */ m->m_epg_tls = ktls_hold(tls); m->m_epg_hdrlen = tls->params.tls_hlen; m->m_epg_trllen = tls->params.tls_tlen; if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) { int bs, delta; /* * AES-CBC pads messages to a multiple of the * block size. Note that the padding is * applied after the digest and the encryption * is done on the "plaintext || mac || padding". * At least one byte of padding is always * present. * * Compute the final trailer length assuming * at most one block of padding. * tls->params.tls_tlen is the maximum * possible trailer length (padding + digest). * delta holds the number of excess padding * bytes if the maximum were used. Those * extra bytes are removed. */ bs = tls->params.tls_bs; delta = (tls_len + tls->params.tls_tlen) & (bs - 1); m->m_epg_trllen -= delta; } m->m_len += m->m_epg_hdrlen + m->m_epg_trllen; /* Populate the TLS header. */ tlshdr = (void *)m->m_epg_hdr; tlshdr->tls_vmajor = tls->params.tls_vmajor; /* * TLS 1.3 masquarades as TLS 1.2 with a record type * of TLS_RLTYPE_APP. */ if (tls->params.tls_vminor == TLS_MINOR_VER_THREE && tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) { tlshdr->tls_vminor = TLS_MINOR_VER_TWO; tlshdr->tls_type = TLS_RLTYPE_APP; /* save the real record type for later */ m->m_epg_record_type = record_type; m->m_epg_trail[0] = record_type; } else { tlshdr->tls_vminor = tls->params.tls_vminor; tlshdr->tls_type = record_type; } tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr)); /* * Store nonces / explicit IVs after the end of the * TLS header. * * For GCM with TLS 1.2, an 8 byte nonce is copied * from the end of the IV. The nonce is then * incremented for use by the next record. * * For CBC, a random nonce is inserted for TLS 1.1+. */ if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 && tls->params.tls_vminor == TLS_MINOR_VER_TWO) { noncep = (uint64_t *)(tls->params.iv + 8); be64enc(tlshdr + 1, *noncep); (*noncep)++; } else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC && tls->params.tls_vminor >= TLS_MINOR_VER_ONE) arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0); /* * When using SW encryption, mark the mbuf not ready. * It will be marked ready via sbready() after the * record has been encrypted. * * When using ifnet TLS, unencrypted TLS records are * sent down the stack to the NIC. */ if (tls->mode == TCP_TLS_MODE_SW) { m->m_flags |= M_NOTREADY; m->m_epg_nrdy = m->m_epg_npgs; if (__predict_false(tls_len == 0)) { /* TLS 1.0 empty fragment. */ *enq_cnt += 1; } else *enq_cnt += m->m_epg_npgs; } } } void ktls_check_rx(struct sockbuf *sb) { struct tls_record_layer hdr; struct ktls_wq *wq; struct socket *so; bool running; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX", __func__, sb)); so = __containerof(sb, struct socket, so_rcv); if (sb->sb_flags & SB_TLS_RX_RUNNING) return; /* Is there enough queued for a TLS header? */ if (sb->sb_tlscc < sizeof(hdr)) { if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0) so->so_error = EMSGSIZE; return; } m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr); /* Is the entire record queued? */ if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) { if ((sb->sb_state & SBS_CANTRCVMORE) != 0) so->so_error = EMSGSIZE; return; } sb->sb_flags |= SB_TLS_RX_RUNNING; soref(so); wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); counter_u64_add(ktls_cnt_rx_queued, 1); } static struct mbuf * ktls_detach_record(struct sockbuf *sb, int len) { struct mbuf *m, *n, *top; int remain; SOCKBUF_LOCK_ASSERT(sb); MPASS(len <= sb->sb_tlscc); /* * If TLS chain is the exact size of the record, * just grab the whole record. */ top = sb->sb_mtls; if (sb->sb_tlscc == len) { sb->sb_mtls = NULL; sb->sb_mtlstail = NULL; goto out; } /* * While it would be nice to use m_split() here, we need * to know exactly what m_split() allocates to update the * accounting, so do it inline instead. */ remain = len; for (m = top; remain > m->m_len; m = m->m_next) remain -= m->m_len; /* Easy case: don't have to split 'm'. */ if (remain == m->m_len) { sb->sb_mtls = m->m_next; if (sb->sb_mtls == NULL) sb->sb_mtlstail = NULL; m->m_next = NULL; goto out; } /* * Need to allocate an mbuf to hold the remainder of 'm'. Try * with M_NOWAIT first. */ n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { /* * Use M_WAITOK with socket buffer unlocked. If * 'sb_mtls' changes while the lock is dropped, return * NULL to force the caller to retry. */ SOCKBUF_UNLOCK(sb); n = m_get(M_WAITOK, MT_DATA); SOCKBUF_LOCK(sb); if (sb->sb_mtls != top) { m_free(n); return (NULL); } } n->m_flags |= M_NOTREADY; /* Store remainder in 'n'. */ n->m_len = m->m_len - remain; if (m->m_flags & M_EXT) { n->m_data = m->m_data + remain; mb_dupcl(n, m); } else { bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len); } /* Trim 'm' and update accounting. */ m->m_len -= n->m_len; sb->sb_tlscc -= n->m_len; sb->sb_ccc -= n->m_len; /* Account for 'n'. */ sballoc_ktls_rx(sb, n); /* Insert 'n' into the TLS chain. */ sb->sb_mtls = n; n->m_next = m->m_next; if (sb->sb_mtlstail == m) sb->sb_mtlstail = n; /* Detach the record from the TLS chain. */ m->m_next = NULL; out: MPASS(m_length(top, NULL) == len); for (m = top; m != NULL; m = m->m_next) sbfree_ktls_rx(sb, m); sb->sb_tlsdcc = len; sb->sb_ccc += len; SBCHECK(sb); return (top); } static void ktls_decrypt(struct socket *so) { char tls_header[MBUF_PEXT_HDR_LEN]; struct ktls_session *tls; struct sockbuf *sb; struct tls_record_layer *hdr; struct tls_get_record tgr; struct mbuf *control, *data, *m; uint64_t seqno; int error, remain, tls_len, trail_len; hdr = (struct tls_record_layer *)tls_header; sb = &so->so_rcv; SOCKBUF_LOCK(sb); KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING, ("%s: socket %p not running", __func__, so)); tls = sb->sb_tls_info; MPASS(tls != NULL); for (;;) { /* Is there enough queued for a TLS header? */ if (sb->sb_tlscc < tls->params.tls_hlen) break; m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header); tls_len = sizeof(*hdr) + ntohs(hdr->tls_length); if (hdr->tls_vmajor != tls->params.tls_vmajor || hdr->tls_vminor != tls->params.tls_vminor) error = EINVAL; else if (tls_len < tls->params.tls_hlen || tls_len > tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 + tls->params.tls_tlen) error = EMSGSIZE; else error = 0; if (__predict_false(error != 0)) { /* * We have a corrupted record and are likely * out of sync. The connection isn't * recoverable at this point, so abort it. */ SOCKBUF_UNLOCK(sb); counter_u64_add(ktls_offload_corrupted_records, 1); CURVNET_SET(so->so_vnet); so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = error; CURVNET_RESTORE(); goto deref; } /* Is the entire record queued? */ if (sb->sb_tlscc < tls_len) break; /* * Split out the portion of the mbuf chain containing * this TLS record. */ data = ktls_detach_record(sb, tls_len); if (data == NULL) continue; MPASS(sb->sb_tlsdcc == tls_len); seqno = sb->sb_tls_seqno; sb->sb_tls_seqno++; SBCHECK(sb); SOCKBUF_UNLOCK(sb); error = tls->sw_decrypt(tls, hdr, data, seqno, &trail_len); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); SOCKBUF_LOCK(sb); if (sb->sb_tlsdcc == 0) { /* * sbcut/drop/flush discarded these * mbufs. */ m_freem(data); break; } /* * Drop this TLS record's data, but keep * decrypting subsequent records. */ sb->sb_ccc -= tls_len; sb->sb_tlsdcc = 0; CURVNET_SET(so->so_vnet); so->so_error = EBADMSG; sorwakeup_locked(so); CURVNET_RESTORE(); m_freem(data); SOCKBUF_LOCK(sb); continue; } /* Allocate the control mbuf. */ tgr.tls_type = hdr->tls_type; tgr.tls_vmajor = hdr->tls_vmajor; tgr.tls_vminor = hdr->tls_vminor; tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen - trail_len); control = sbcreatecontrol_how(&tgr, sizeof(tgr), TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK); SOCKBUF_LOCK(sb); if (sb->sb_tlsdcc == 0) { /* sbcut/drop/flush discarded these mbufs. */ MPASS(sb->sb_tlscc == 0); m_freem(data); m_freem(control); break; } /* * Clear the 'dcc' accounting in preparation for * adding the decrypted record. */ sb->sb_ccc -= tls_len; sb->sb_tlsdcc = 0; SBCHECK(sb); /* If there is no payload, drop all of the data. */ if (tgr.tls_length == htobe16(0)) { m_freem(data); data = NULL; } else { /* Trim header. */ remain = tls->params.tls_hlen; while (remain > 0) { if (data->m_len > remain) { data->m_data += remain; data->m_len -= remain; break; } remain -= data->m_len; data = m_free(data); } /* Trim trailer and clear M_NOTREADY. */ remain = be16toh(tgr.tls_length); m = data; for (m = data; remain > m->m_len; m = m->m_next) { m->m_flags &= ~M_NOTREADY; remain -= m->m_len; } m->m_len = remain; m_freem(m->m_next); m->m_next = NULL; m->m_flags &= ~M_NOTREADY; /* Set EOR on the final mbuf. */ m->m_flags |= M_EOR; } sbappendcontrol_locked(sb, data, control, 0); } sb->sb_flags &= ~SB_TLS_RX_RUNNING; if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0) so->so_error = EMSGSIZE; sorwakeup_locked(so); deref: SOCKBUF_UNLOCK_ASSERT(sb); CURVNET_SET(so->so_vnet); SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); } void ktls_enqueue_to_free(struct mbuf *m) { struct ktls_wq *wq; bool running; /* Mark it for freeing. */ m->m_epg_flags |= EPG_FLAG_2FREE; wq = &ktls_wq[m->m_epg_tls->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); } static void * ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m) { void *buf; if (m->m_epg_npgs <= 2) return (NULL); if (ktls_buffer_zone == NULL) return (NULL); if ((u_int)(ticks - wq->lastallocfail) < hz) { /* * Rate-limit allocation attempts after a failure. * ktls_buffer_import() will acquire a per-domain mutex to check * the free page queues and may fail consistently if memory is * fragmented. */ return (NULL); } buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM); if (buf == NULL) wq->lastallocfail = ticks; return (buf); } void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count) { struct ktls_wq *wq; bool running; KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY)), ("ktls_enqueue: %p not unready & nomap mbuf\n", m)); KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count")); KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf")); m->m_epg_enc_cnt = page_count; /* * Save a pointer to the socket. The caller is responsible * for taking an additional reference via soref(). */ m->m_epg_so = so; wq = &ktls_wq[m->m_epg_tls->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); counter_u64_add(ktls_cnt_tx_queued, 1); } #define MAX_TLS_PAGES (1 + btoc(TLS_MAX_MSG_SIZE_V10_2)) static __noinline void ktls_encrypt(struct ktls_wq *wq, struct mbuf *top) { struct ktls_session *tls; struct socket *so; struct mbuf *m; vm_paddr_t parray[MAX_TLS_PAGES + 1]; struct iovec dst_iov[MAX_TLS_PAGES + 2]; vm_page_t pg; void *cbuf; int error, i, len, npages, off, total_pages; so = top->m_epg_so; tls = top->m_epg_tls; KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top)); KASSERT(so != NULL, ("so = NULL, top = %p\n", top)); #ifdef INVARIANTS top->m_epg_so = NULL; #endif total_pages = top->m_epg_enc_cnt; npages = 0; /* * Encrypt the TLS records in the chain of mbufs starting with * 'top'. 'total_pages' gives us a total count of pages and is * used to know when we have finished encrypting the TLS * records originally queued with 'top'. * * NB: These mbufs are queued in the socket buffer and * 'm_next' is traversing the mbufs in the socket buffer. The * socket buffer lock is not held while traversing this chain. * Since the mbufs are all marked M_NOTREADY their 'm_next' * pointers should be stable. However, the 'm_next' of the * last mbuf encrypted is not necessarily NULL. It can point * to other mbufs appended while 'top' was on the TLS work * queue. * * Each mbuf holds an entire TLS record. */ error = 0; for (m = top; npages != total_pages; m = m->m_next) { KASSERT(m->m_epg_tls == tls, ("different TLS sessions in a single mbuf chain: %p vs %p", tls, m->m_epg_tls)); KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY), ("%p not unready & nomap mbuf (top = %p)\n", m, top)); KASSERT(npages + m->m_epg_npgs <= total_pages, ("page count mismatch: top %p, total_pages %d, m %p", top, total_pages, m)); KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen, ("page count %d larger than maximum frame length %d", m->m_epg_npgs, ktls_maxlen)); /* * For anonymous mbufs, encryption is done in place. * For file-backed mbufs (from sendfile), anonymous * wired pages are allocated and used as the * encryption destination. */ if ((m->m_epg_flags & EPG_FLAG_ANON) != 0) { error = (*tls->sw_encrypt)(tls, m, NULL, 0); } else { if ((cbuf = ktls_buffer_alloc(wq, m)) != NULL) { len = ptoa(m->m_epg_npgs - 1) + m->m_epg_last_len - m->m_epg_1st_off; dst_iov[0].iov_base = (char *)cbuf + m->m_epg_1st_off; dst_iov[0].iov_len = len; parray[0] = DMAP_TO_PHYS((vm_offset_t)cbuf); i = 1; } else { off = m->m_epg_1st_off; for (i = 0; i < m->m_epg_npgs; i++, off = 0) { do { pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED | VM_ALLOC_WAITFAIL); } while (pg == NULL); len = m_epg_pagelen(m, i, off); parray[i] = VM_PAGE_TO_PHYS(pg); dst_iov[i].iov_base = (char *)(void *)PHYS_TO_DMAP( parray[i]) + off; dst_iov[i].iov_len = len; } } KASSERT(i + 1 <= nitems(dst_iov), ("dst_iov is too small")); dst_iov[i].iov_base = m->m_epg_trail; dst_iov[i].iov_len = m->m_epg_trllen; error = (*tls->sw_encrypt)(tls, m, dst_iov, i + 1); /* Free the old pages. */ m->m_ext.ext_free(m); /* Replace them with the new pages. */ if (cbuf != NULL) { for (i = 0; i < m->m_epg_npgs; i++) m->m_epg_pa[i] = parray[0] + ptoa(i); /* Contig pages should go back to the cache. */ m->m_ext.ext_free = ktls_free_mext_contig; } else { for (i = 0; i < m->m_epg_npgs; i++) m->m_epg_pa[i] = parray[i]; /* Use the basic free routine. */ m->m_ext.ext_free = mb_free_mext_pgs; } /* Pages are now writable. */ m->m_epg_flags |= EPG_FLAG_ANON; } if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); break; } if (__predict_false(m->m_epg_npgs == 0)) { /* TLS 1.0 empty fragment. */ npages++; } else npages += m->m_epg_npgs; /* * Drop a reference to the session now that it is no * longer needed. Existing code depends on encrypted * records having no associated session vs * yet-to-be-encrypted records having an associated * session. */ m->m_epg_tls = NULL; ktls_free(tls); } CURVNET_SET(so->so_vnet); if (error == 0) { (void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages); } else { so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; mb_free_notready(top, total_pages); } SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); } static void ktls_work_thread(void *ctx) { struct ktls_wq *wq = ctx; struct mbuf *m, *n; struct socket *so, *son; STAILQ_HEAD(, mbuf) local_m_head; STAILQ_HEAD(, socket) local_so_head; if (ktls_bind_threads > 1) { curthread->td_domain.dr_policy = DOMAINSET_PREF(PCPU_GET(domain)); } #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) fpu_kern_thread(0); #endif for (;;) { mtx_lock(&wq->mtx); while (STAILQ_EMPTY(&wq->m_head) && STAILQ_EMPTY(&wq->so_head)) { wq->running = false; mtx_sleep(wq, &wq->mtx, 0, "-", 0); wq->running = true; } STAILQ_INIT(&local_m_head); STAILQ_CONCAT(&local_m_head, &wq->m_head); STAILQ_INIT(&local_so_head); STAILQ_CONCAT(&local_so_head, &wq->so_head); mtx_unlock(&wq->mtx); STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) { if (m->m_epg_flags & EPG_FLAG_2FREE) { ktls_free(m->m_epg_tls); m_free_raw(m); } else { ktls_encrypt(wq, m); counter_u64_add(ktls_cnt_tx_queued, -1); } } STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) { ktls_decrypt(so); counter_u64_add(ktls_cnt_rx_queued, -1); } } } + +static void +ktls_disable_ifnet_help(void *context, int pending __unused) +{ + struct ktls_session *tls; + struct inpcb *inp; + struct tcpcb *tp; + struct socket *so; + int err; + + tls = context; + inp = tls->inp; + if (inp == NULL) + return; + INP_WLOCK(inp); + so = inp->inp_socket; + MPASS(so != NULL); + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || + (inp->inp_flags2 & INP_FREED)) { + goto out; + } + + if (so->so_snd.sb_tls_info != NULL) + err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW); + else + err = ENXIO; + if (err == 0) { + counter_u64_add(ktls_ifnet_disable_ok, 1); + /* ktls_set_tx_mode() drops inp wlock, so recheck flags */ + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 && + (inp->inp_flags2 & INP_FREED) == 0 && + (tp = intotcpcb(inp)) != NULL && + tp->t_fb->tfb_hwtls_change != NULL) + (*tp->t_fb->tfb_hwtls_change)(tp, 0); + } else { + counter_u64_add(ktls_ifnet_disable_fail, 1); + } + +out: + SOCK_LOCK(so); + sorele(so); + if (!in_pcbrele_wlocked(inp)) + INP_WUNLOCK(inp); + ktls_free(tls); +} + +/* + * Called when re-transmits are becoming a substantial portion of the + * sends on this connection. When this happens, we transition the + * connection to software TLS. This is needed because most inline TLS + * NICs keep crypto state only for in-order transmits. This means + * that to handle a TCP rexmit (which is out-of-order), the NIC must + * re-DMA the entire TLS record up to and including the current + * segment. This means that when re-transmitting the last ~1448 byte + * segment of a 16KB TLS record, we could wind up re-DMA'ing an order + * of magnitude more data than we are sending. This can cause the + * PCIe link to saturate well before the network, which can cause + * output drops, and a general loss of capacity. + */ +void +ktls_disable_ifnet(void *arg) +{ + struct tcpcb *tp; + struct inpcb *inp; + struct socket *so; + struct ktls_session *tls; + + tp = arg; + inp = tp->t_inpcb; + INP_WLOCK_ASSERT(inp); + so = inp->inp_socket; + SOCK_LOCK(so); + tls = so->so_snd.sb_tls_info; + if (tls->disable_ifnet_pending) { + SOCK_UNLOCK(so); + return; + } + + /* + * note that disable_ifnet_pending is never cleared; disabling + * ifnet can only be done once per session, so we never want + * to do it again + */ + + (void)ktls_hold(tls); + in_pcbref(inp); + soref(so); + tls->disable_ifnet_pending = true; + tls->inp = inp; + SOCK_UNLOCK(so); + TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls); + (void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task); +} diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index dd30f89896d2..3f72a821e71f 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1,1156 +1,1167 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 * $FreeBSD$ */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include #include #ifdef _KERNEL +#include "opt_kern_tls.h" #include #include +#include #endif #define TCP_END_BYTE_INFO 8 /* Bytes that makeup the "end information array" */ /* Types of ending byte info */ #define TCP_EI_EMPTY_SLOT 0 #define TCP_EI_STATUS_CLIENT_FIN 0x1 #define TCP_EI_STATUS_CLIENT_RST 0x2 #define TCP_EI_STATUS_SERVER_FIN 0x3 #define TCP_EI_STATUS_SERVER_RST 0x4 #define TCP_EI_STATUS_RETRAN 0x5 #define TCP_EI_STATUS_PROGRESS 0x6 #define TCP_EI_STATUS_PERSIST_MAX 0x7 #define TCP_EI_STATUS_KEEP_MAX 0x8 #define TCP_EI_STATUS_DATA_A_CLOSE 0x9 #define TCP_EI_STATUS_RST_IN_FRONT 0xa #define TCP_EI_STATUS_2MSL 0xb #define TCP_EI_STATUS_MAX_VALUE 0xb /************************************************/ /* Status bits we track to assure no duplicates, * the bits here are not used by the code but * for human representation. To check a bit we * take and shift over by 1 minus the value (1-8). */ /************************************************/ #define TCP_EI_BITS_CLIENT_FIN 0x001 #define TCP_EI_BITS_CLIENT_RST 0x002 #define TCP_EI_BITS_SERVER_FIN 0x004 #define TCP_EI_BITS_SERVER_RST 0x008 #define TCP_EI_BITS_RETRAN 0x010 #define TCP_EI_BITS_PROGRESS 0x020 #define TCP_EI_BITS_PRESIST_MAX 0x040 #define TCP_EI_BITS_KEEP_MAX 0x080 #define TCP_EI_BITS_DATA_A_CLO 0x100 #define TCP_EI_BITS_RST_IN_FR 0x200 /* a front state reset */ #define TCP_EI_BITS_2MS_TIMER 0x400 /* 2 MSL timer expired */ #if defined(_KERNEL) || defined(_WANT_TCPCB) /* TCP segment queue entry */ struct tseg_qent { TAILQ_ENTRY(tseg_qent) tqe_q; struct mbuf *tqe_m; /* mbuf contains packet */ struct mbuf *tqe_last; /* last mbuf in chain */ tcp_seq tqe_start; /* TCP Sequence number start */ int tqe_len; /* TCP segment data length */ uint32_t tqe_flags; /* The flags from the th->th_flags */ uint32_t tqe_mbuf_cnt; /* Count of mbuf overhead */ }; TAILQ_HEAD(tsegqe_head, tseg_qent); struct sackblk { tcp_seq start; /* start seq no. of sack block */ tcp_seq end; /* end seq no. */ }; struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int32_t sack_bytes_rexmit; tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ int32_t delivered_data; /* Newly acked data from last SACK */ int32_t sacked_bytes; /* Total sacked bytes reported by the * receiver via sack option */ uint32_t recover_fs; /* Flight Size at the start of Loss recovery */ uint32_t prr_delivered; /* Total bytes delivered using PRR */ uint32_t prr_out; /* Bytes sent during IN_RECOVERY */ }; #define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq) STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); /* * Tcp control block, one per tcp; fields: * Organized for 64 byte cacheline efficiency based * on common tcp_input/tcp_output processing. */ struct tcpcb { /* Cache line 1 */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ struct tcp_function_block *t_fb;/* TCP function call block */ void *t_fb_ptr; /* Pointer to t_fb specific data */ uint32_t t_maxseg:24, /* maximum segment size */ t_logstate:8; /* State of "black box" logging */ uint32_t t_port:16, /* Tunneling (over udp) port */ t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ t_fin_is_rst: 1, /* Are fin's treated as resets */ t_log_state_set: 1, bits_spare : 2; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ uint32_t snd_wnd; /* send window */ uint32_t snd_cwnd; /* congestion-controlled window */ uint32_t t_peakrate_thr; /* pre-calculated peak rate threshold */ /* Cache line 2 */ u_int32_t ts_offset; /* our timestamp offset */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rcv_numsacks; /* # distinct sack blks present */ u_int t_tsomax; /* TSO total burst length limit in bytes */ u_int t_tsomaxsegcount; /* TSO maximum segment count */ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ uint32_t rcv_wnd; /* receive window */ u_int t_flags2; /* More tcpcb flags storage */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ u_int32_t ts_recent; /* timestamp echo data */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char snd_limited; /* segments limited transmitted */ u_char request_r_scale; /* pending window scaling */ tcp_seq last_ack_sent; u_int t_rcvtime; /* inactivity time */ /* Cache line 3 */ tcp_seq rcv_up; /* receive urgent pointer */ int t_segqlen; /* segment reassembly queue length */ uint32_t t_segqmbuflen; /* Count of bytes mbufs on all entries */ struct tsegqe_head t_segq; /* segment reassembly queue */ struct mbuf *t_in_pkt; struct mbuf *t_tail_pkt; struct tcp_timer *t_timers; /* All the TCP timers in one struct */ struct vnet *t_vnet; /* back pointer to parent vnet */ uint32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ tcp_seq snd_wl1; /* window update seg seq number */ /* Cache line 4 */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq irs; /* initial receive sequence number */ tcp_seq iss; /* initial send sequence number */ u_int t_acktime; /* RACK and BBR incoming new data was acked */ u_int t_sndtime; /* time last data was sent */ u_int ts_recent_age; /* when last updated */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ uint16_t cl4_spare; /* Spare to adjust CL 4 */ char t_oobflags; /* have some */ char t_iobc; /* input character */ int t_rxtcur; /* current retransmit value (ticks) */ int t_rxtshift; /* log(2) of rexmt exp. backoff */ u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ u_int t_starttime; /* time connection was established */ u_int t_fbyte_in; /* ticks time when first byte queued in */ u_int t_fbyte_out; /* ticks time when first byte queued out */ u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ int t_blackhole_enter; /* when to enter blackhole detection */ int t_blackhole_exit; /* when to exit blackhole detection */ u_int t_rttmin; /* minimum rtt allowed */ u_int t_rttbest; /* best rtt we've seen */ int t_softerror; /* possible error not yet reported */ uint32_t max_sndwnd; /* largest window peer has offered */ /* Cache line 5 */ uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */ uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ int t_sndzerowin; /* zero-window updates sent */ u_long t_rttupdated; /* number of times rtt sampled */ int snd_numholes; /* number of holes seen by sender */ u_int t_badrxtwin; /* window for retransmit recovery */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observerved RTT */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ struct toedev *tod; /* toedev handling this connection */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ struct cc_algo *cc_algo; /* congestion control algorithm */ struct cc_var *ccv; /* congestion control specific vars */ struct osd *osd; /* storage for Khelp module data */ int t_bytes_acked; /* # bytes acked during current RTT */ u_int t_maxunacktime; u_int t_keepinit; /* time to establish connection */ u_int t_keepidle; /* time before keepalive probes begin */ u_int t_keepintvl; /* interval between keepalives */ u_int t_keepcnt; /* number of keepalives before close */ int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ int t_loglimit; /* Maximum number of log entries */ int64_t t_pacing_rate; /* bytes / sec, -1 => unlimited */ struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; const char *t_output_caller; /* Function that called tcp_output */ struct statsblob *t_stats; /* Per-connection stats */ uint32_t t_logsn; /* Log "serial number" */ uint32_t gput_ts; /* Time goodput measurement started */ tcp_seq gput_seq; /* Outbound measurement seq */ tcp_seq gput_ack; /* Inbound measurement ack */ int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */ uint32_t t_maxpeakrate; /* max peak rate set by user, in bytes/s */ uint32_t t_sndtlppack; /* tail loss probe packets sent */ uint64_t t_sndtlpbyte; /* total tail loss probe bytes sent */ uint64_t t_sndbytes; /* total bytes sent */ uint64_t t_snd_rxt_bytes; /* total bytes retransmitted */ uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */ uint32_t t_end_info_status; /* Status flag of end info */ unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */ union { uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN]; uint64_t server; } t_tfo_cookie; /* TCP Fast Open cookie to send */ union { uint8_t t_end_info_bytes[TCP_END_BYTE_INFO]; uint64_t t_end_info; }; #ifdef TCPPCAP struct mbufq t_inpkts; /* List of saved input packets. */ struct mbufq t_outpkts; /* List of saved output packets. */ #endif }; #endif /* _KERNEL || _WANT_TCPCB */ #ifdef _KERNEL struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; /* Enable TCP/UDP tunneling port */ #define TCP_TUNNELING_PORT_MIN 0 #define TCP_TUNNELING_PORT_MAX 65535 #define TCP_TUNNELING_PORT_DEFAULT 0 /* Enable TCP/UDP tunneling port */ #define TCP_TUNNELING_OVERHEAD_MIN sizeof(struct udphdr) #define TCP_TUNNELING_OVERHEAD_MAX 1024 #define TCP_TUNNELING_OVERHEAD_DEFAULT TCP_TUNNELING_OVERHEAD_MIN /* Minimum map entries limit value, if set */ #define TCP_MIN_MAP_ENTRIES_LIMIT 128 /* * TODO: We yet need to brave plowing in * to tcp_input() and the pru_usrreq() block. * Right now these go to the old standards which * are somewhat ok, but in the long term may * need to be changed. If we do tackle tcp_input() * then we need to get rid of the tcp_do_segment() * function below. */ /* Flags for tcp functions */ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ /* * If defining the optional tcp_timers, in the * tfb_tcp_timer_stop call you must use the * callout_async_drain() function with the * tcp_timer_discard callback. You should check * the return of callout_async_drain() and if 0 * increment tt_draincnt. Since the timer sub-system * does not know your callbacks you must provide a * stop_all function that loops through and calls * tcp_timer_stop() with each of your defined timers. * Adding a tfb_tcp_handoff_ok function allows the socket * option to change stacks to query you even if the * connection is in a later stage. You return 0 to * say you can take over and run your stack, you return * non-zero (an error number) to say no you can't. * If the function is undefined you can only change * in the early states (before connect or listen). * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being * destroyed, a zero indicates its transitioning to * another stack (via socket option). */ struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *); void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int); int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int, struct timeval *); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int, struct timeval *); int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); /* Optional memory allocation/free routine */ int (*tfb_tcp_fb_init)(struct tcpcb *); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); void (*tfb_tcp_timer_activate)(struct tcpcb *, uint32_t, u_int); int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t); void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t); void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); void (*tfb_tcp_mtu_chg)(struct tcpcb *); int (*tfb_pru_options)(struct tcpcb *, int); void (*tfb_hwtls_change)(struct tcpcb *, int); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; uint8_t tfb_id; }; struct tcp_function { TAILQ_ENTRY(tcp_function) tf_next; char tf_name[TCP_FUNCTION_NAME_LEN_MAX]; struct tcp_function_block *tf_fb; }; TAILQ_HEAD(tcp_funchead, tcp_function); #endif /* _KERNEL */ /* * Flags and utility macros for the t_flags field. */ #define TF_ACKNOW 0x00000001 /* ack peer immediately */ #define TF_DELACK 0x00000002 /* ack, but try to delay it */ #define TF_NODELAY 0x00000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x00000008 /* don't use tcp options */ #define TF_SENTFIN 0x00000010 /* have sent FIN */ #define TF_REQ_SCALE 0x00000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x00000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x00000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x00000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x00000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x00000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x00000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x00001000 /* don't push */ #define TF_PREVVALID 0x00002000 /* saved values for bad rxmit valid */ #define TF_WAKESOR 0x00004000 /* wake up receive socket */ #define TF_GPUTINPROG 0x00008000 /* Goodput measurement in progress */ #define TF_MORETOCOME 0x00010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x00020000 /* listen queue overflow */ #define TF_LASTIDLE 0x00040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x00080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x00100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x00200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x00400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x00800000 /* force out a byte */ #define TF_TSO 0x01000000 /* TSO enabled on this connection */ #define TF_TOE 0x02000000 /* this connection is offloaded */ #define TF_UNUSED0 0x04000000 /* unused */ #define TF_UNUSED1 0x08000000 /* unused */ #define TF_LRD 0x10000000 /* Lost Retransmission Detection */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ #define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY #define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY) #define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY #define EXIT_CONGRECOVERY(t_flags) t_flags &= ~TF_CONGRECOVERY #define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY)) #define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY) #define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY) #if defined(_KERNEL) && !defined(TCP_RFC7413) #define IS_FASTOPEN(t_flags) (false) #else #define IS_FASTOPEN(t_flags) (t_flags & TF_FASTOPEN) #endif #define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una) /* * Flags for the t_oobflags field. */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 /* * Flags for the extended TCP flags field, t_flags2 */ #define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */ #define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */ #define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */ #define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */ #define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */ #define TF2_ECN_PERMIT 0x00000020 /* connection ECN-ready */ #define TF2_ECN_SND_CWR 0x00000040 /* ECN CWR in queue */ #define TF2_ECN_SND_ECE 0x00000080 /* ECN ECE in queue */ #define TF2_ACE_PERMIT 0x00000100 /* Accurate ECN mode */ #define TF2_FBYTES_COMPLETE 0x00000400 /* We have first bytes in and out */ /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. * It's basically used to reduce the number of parameters * to tcp_dooptions and tcp_addoptions. * The binary order of the to_flags is relevant for packing of the * options in tcp_addoptions. */ struct tcpopt { u_int32_t to_flags; /* which options are present */ #define TOF_MSS 0x0001 /* maximum segment size */ #define TOF_SCALE 0x0002 /* window scaling */ #define TOF_SACKPERM 0x0004 /* SACK permitted */ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ #define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */ #define TOF_MAXOPT 0x0200 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ u_char *to_signature; /* pointer to the TCP-MD5 signature */ u_int8_t *to_tfo_cookie; /* pointer to the TFO cookie */ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ u_int8_t to_tfo_len; /* TFO cookie length */ u_int32_t to_spare; /* UTO */ }; /* * Flags for tcp_dooptions. */ #define TO_SYN 0x01 /* parse SYN-only options */ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ uint32_t rmx_mtu; /* MTU for this path */ uint32_t rmx_ssthresh; /* outbound gateway buffer limit */ uint32_t rmx_rtt; /* estimated round trip time */ uint32_t rmx_rttvar; /* estimated rtt variance */ uint32_t rmx_cwnd; /* congestion window */ uint32_t rmx_sendpipe; /* outbound delay-bandwidth product */ uint32_t rmx_recvpipe; /* inbound delay-bandwidth product */ }; /* * Used by tcp_maxmtu() to communicate interface specific features * and limits at the time of connection setup. */ struct tcp_ifcap { int ifcap; u_int tsomax; u_int tsomaxsegcount; u_int tsomaxsegsize; }; #ifndef _NETINET_IN_PCB_H_ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ struct tcptw { struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */ uint32_t t_port:16, /* UDP port number if TCPoUDP */ t_unused:16; tcp_seq snd_nxt; tcp_seq rcv_nxt; tcp_seq iss; tcp_seq irs; u_short last_win; /* cached window value */ short tw_so_options; /* copy of so_options */ struct ucred *tw_cred; /* user credentials */ u_int32_t t_recent; u_int32_t ts_offset; /* our timestamp offset */ u_int t_starttime; int tw_time; TAILQ_ENTRY(tcptw) tw_2msl; void *tw_pspare; /* TCP_SIGNATURE */ u_int *tw_spare; /* TCP_SIGNATURE */ }; #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) #define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* * The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). * With these scales, srtt has 3 bits to the right of the binary point, * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the * binary point, and is smoothed with an ALPHA of 0.75. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ #define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ #define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ #define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ /* * The initial retransmission should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This version of the macro adapted from a paper by Lawrence * Brakmo and Larry Peterson which outlines a problem caused * by insufficient precision in the original implementation, * which results in inappropriately large RTO values for very * fast networks. */ #define TCP_REXMTVAL(tp) \ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* * TCP statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct tcpstat { uint64_t tcps_connattempt; /* connections initiated */ uint64_t tcps_accepts; /* connections accepted */ uint64_t tcps_connects; /* connections established */ uint64_t tcps_drops; /* connections dropped */ uint64_t tcps_conndrops; /* embryonic connections dropped */ uint64_t tcps_minmssdrops; /* average minmss too low drops */ uint64_t tcps_closed; /* conn. closed (includes drops) */ uint64_t tcps_segstimed; /* segs where we tried to get rtt */ uint64_t tcps_rttupdated; /* times we succeeded */ uint64_t tcps_delack; /* delayed acks sent */ uint64_t tcps_timeoutdrop; /* conn. dropped in rxmt timeout */ uint64_t tcps_rexmttimeo; /* retransmit timeouts */ uint64_t tcps_persisttimeo; /* persist timeouts */ uint64_t tcps_keeptimeo; /* keepalive timeouts */ uint64_t tcps_keepprobe; /* keepalive probes sent */ uint64_t tcps_keepdrops; /* connections dropped in keepalive */ uint64_t tcps_sndtotal; /* total packets sent */ uint64_t tcps_sndpack; /* data packets sent */ uint64_t tcps_sndbyte; /* data bytes sent */ uint64_t tcps_sndrexmitpack; /* data packets retransmitted */ uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */ uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ uint64_t tcps_sndacks; /* ack-only packets sent */ uint64_t tcps_sndprobe; /* window probes sent */ uint64_t tcps_sndurg; /* packets sent with URG only */ uint64_t tcps_sndwinup; /* window update-only packets sent */ uint64_t tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ uint64_t tcps_rcvtotal; /* total packets received */ uint64_t tcps_rcvpack; /* packets received in sequence */ uint64_t tcps_rcvbyte; /* bytes received in sequence */ uint64_t tcps_rcvbadsum; /* packets received with ccksum errs */ uint64_t tcps_rcvbadoff; /* packets received with bad offset */ uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */ uint64_t tcps_rcvshort; /* packets received too short */ uint64_t tcps_rcvduppack; /* duplicate-only packets received */ uint64_t tcps_rcvdupbyte; /* duplicate-only bytes received */ uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */ uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */ uint64_t tcps_rcvoopack; /* out-of-order packets received */ uint64_t tcps_rcvoobyte; /* out-of-order bytes received */ uint64_t tcps_rcvpackafterwin; /* packets with data after window */ uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */ uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */ uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */ uint64_t tcps_rcvdupack; /* rcvd duplicate acks */ uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */ uint64_t tcps_rcvackpack; /* rcvd ack packets */ uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */ uint64_t tcps_rcvwinupd; /* rcvd window update packets */ uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */ uint64_t tcps_predack; /* times hdr predict ok for acks */ uint64_t tcps_preddat; /* times hdr predict ok for data pkts */ uint64_t tcps_pcbcachemiss; uint64_t tcps_cachedrtt; /* times cached RTT in route updated */ uint64_t tcps_cachedrttvar; /* times cached rttvar updated */ uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */ uint64_t tcps_usedrtt; /* times RTT initialized from route */ uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */ uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/ uint64_t tcps_persistdrop; /* timeout in persist state */ uint64_t tcps_badsyn; /* bogus SYN, e.g. premature ACK */ uint64_t tcps_mturesent; /* resends due to MTU discovery */ uint64_t tcps_listendrop; /* listen queue overflows */ uint64_t tcps_badrst; /* ignored RSTs in the window */ uint64_t tcps_sc_added; /* entry added to syncache */ uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */ uint64_t tcps_sc_dropped; /* could not reply to packet */ uint64_t tcps_sc_completed; /* successful extraction of entry */ uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */ uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ uint64_t tcps_sc_reset; /* RST removed entry from syncache */ uint64_t tcps_sc_stale; /* timed out or listen socket gone */ uint64_t tcps_sc_aborted; /* syncache entry aborted */ uint64_t tcps_sc_badack; /* removed due to bad ACK */ uint64_t tcps_sc_unreach; /* ICMP unreachable received */ uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ uint64_t tcps_sack_lostrexmt; /* SACK lost retransmission recovered */ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ uint64_t tcps_ecn_ce; /* ECN Congestion Experienced */ uint64_t tcps_ecn_ect0; /* ECN Capable Transport */ uint64_t tcps_ecn_ect1; /* ECN Capable Transport */ uint64_t tcps_ecn_shs; /* ECN successful handshakes */ uint64_t tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ /* TCP_SIGNATURE related stats */ uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */ uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */ uint64_t tcps_sig_err_buildsig; /* Failed to make signature */ uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */ uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */ /* Path MTU Discovery Black Hole Detection related stats */ uint64_t tcps_pmtud_blackhole_activated; /* Black Hole Count */ uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */ uint64_t tcps_pmtud_blackhole_failed; /* Black Hole Failure Count */ uint64_t tcps_tunneled_pkts; /* Packets encap's in UDP received */ uint64_t tcps_tunneled_errs; /* Packets that had errors that were UDP encaped */ uint64_t _pad[9]; /* 6 UTO, 3 TBD */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL #define TI_UNLOCKED 1 #define TI_RLOCKED 2 #include VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define TCPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_add(int statnum, int val); #define KMOD_TCPSTAT_ADD(name, val) \ kmod_tcpstat_add(offsetof(struct tcpstat, name) / sizeof(uint64_t), val) #define KMOD_TCPSTAT_INC(name) KMOD_TCPSTAT_ADD(name, 1) /* * Running TCP connection count by state. */ VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]); #define V_tcps_states VNET(tcps_states) #define TCPSTATES_INC(state) counter_u64_add(V_tcps_states[state], 1) #define TCPSTATES_DEC(state) counter_u64_add(V_tcps_states[state], -1) /* * TCP specific helper hook point identifiers. */ #define HHOOK_TCP_EST_IN 0 #define HHOOK_TCP_EST_OUT 1 #define HHOOK_TCP_LAST HHOOK_TCP_EST_OUT struct tcp_hhook_data { struct tcpcb *tp; struct tcphdr *th; struct tcpopt *to; uint32_t len; int tso; tcp_seq curack; }; #ifdef TCP_HHOOK void hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t len, int tso); #endif #endif /* * TCB structure exported to user-land via sysctl(3). * * Fields prefixed with "xt_" are unique to the export structure, and fields * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'. * * Legend: * (s) - used by userland utilities in src * (p) - used by utilities in ports * (3) - is known to be used by third party software not in ports * (n) - no known usage * * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. */ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { ksize_t xt_len; /* length of this structure */ struct xinpcb xt_inp; char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */ char xt_logid[TCP_LOG_ID_LEN]; /* (s) */ char xt_cc[TCP_CA_NAME_MAX]; /* (s) */ int64_t spare64[6]; int32_t t_state; /* (s,p) */ uint32_t t_flags; /* (s,p) */ int32_t t_sndzerowin; /* (s) */ int32_t t_sndrexmitpack; /* (s) */ int32_t t_rcvoopack; /* (s) */ int32_t t_rcvtime; /* (s) */ int32_t tt_rexmt; /* (s) */ int32_t tt_persist; /* (s) */ int32_t tt_keep; /* (s) */ int32_t tt_2msl; /* (s) */ int32_t tt_delack; /* (s) */ int32_t t_logstate; /* (3) */ uint32_t t_snd_cwnd; /* (s) */ uint32_t t_snd_ssthresh; /* (s) */ uint32_t t_maxseg; /* (s) */ uint32_t t_rcv_wnd; /* (s) */ uint32_t t_snd_wnd; /* (s) */ uint32_t xt_ecn; /* (s) */ uint16_t xt_encaps_port; /* (s) */ int16_t spare16; int32_t spare32[25]; } __aligned(8); #ifdef _KERNEL void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *); #endif #endif /* * TCP function information (name-to-id mapping, aliases, and refcnt) * exported to user-land via sysctl(3). */ struct tcp_function_info { uint32_t tfi_refcnt; uint8_t tfi_id; char tfi_name[TCP_FUNCTION_NAME_LEN_MAX]; char tfi_alias[TCP_FUNCTION_NAME_LEN_MAX]; }; /* * Identifiers for TCP sysctl nodes */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ #define TCPCTL_STATS 4 /* statistics */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_STATES 16 /* connection counts by TCP state */ #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); SYSCTL_DECL(_net_inet_tcp_sack); MALLOC_DECLARE(M_TCPLOG); #endif VNET_DECLARE(int, tcp_log_in_vain); #define V_tcp_log_in_vain VNET(tcp_log_in_vain) /* * Global TCP tunables shared between different stacks. * Please keep the list sorted. */ VNET_DECLARE(int, drop_synfin); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, tcp_abc_l_var); VNET_DECLARE(int, tcp_autorcvbuf_max); VNET_DECLARE(int, tcp_autosndbuf_inc); VNET_DECLARE(int, tcp_autosndbuf_max); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_autorcvbuf); VNET_DECLARE(int, tcp_do_autosndbuf); VNET_DECLARE(int, tcp_do_ecn); VNET_DECLARE(int, tcp_do_lrd); VNET_DECLARE(int, tcp_do_prr); VNET_DECLARE(int, tcp_do_prr_conservative); VNET_DECLARE(int, tcp_do_newcwv); VNET_DECLARE(int, tcp_do_rfc1323); VNET_DECLARE(int, tcp_tolerate_missing_ts); VNET_DECLARE(int, tcp_do_rfc3042); VNET_DECLARE(int, tcp_do_rfc3390); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_do_newsack); VNET_DECLARE(int, tcp_do_sack); VNET_DECLARE(int, tcp_do_tso); VNET_DECLARE(int, tcp_ecn_maxretries); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, tcp_insecure_syn); VNET_DECLARE(uint32_t, tcp_map_entries_limit); VNET_DECLARE(uint32_t, tcp_map_split_limit); VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_mssdflt); #ifdef STATS VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl); VNET_DECLARE(int, tcp_perconn_stats_enable); #endif /* STATS */ VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, tcp_sack_globalholes); VNET_DECLARE(int, tcp_sack_globalmaxholes); VNET_DECLARE(int, tcp_sack_maxholes); VNET_DECLARE(int, tcp_sc_rst_sock_fail); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(int, tcp_udp_tunneling_overhead); VNET_DECLARE(int, tcp_udp_tunneling_port); VNET_DECLARE(struct inpcbhead, tcb); VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_do_lrd VNET(tcp_do_lrd) #define V_tcp_do_prr VNET(tcp_do_prr) #define V_tcp_do_prr_conservative VNET(tcp_do_prr_conservative) #define V_tcp_do_newcwv VNET(tcp_do_newcwv) #define V_drop_synfin VNET(drop_synfin) #define V_path_mtu_discovery VNET(path_mtu_discovery) #define V_tcb VNET(tcb) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) #define V_tcp_tolerate_missing_ts VNET(tcp_tolerate_missing_ts) #define V_tcp_ts_offset_per_conn VNET(tcp_ts_offset_per_conn) #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define V_tcp_do_newsack VNET(tcp_do_newsack) #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_do_tso VNET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_insecure_rst VNET(tcp_insecure_rst) #define V_tcp_insecure_syn VNET(tcp_insecure_syn) #define V_tcp_map_entries_limit VNET(tcp_map_entries_limit) #define V_tcp_map_split_limit VNET(tcp_map_split_limit) #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_mssdflt VNET(tcp_mssdflt) #ifdef STATS #define V_tcp_perconn_stats_dflt_tpl VNET(tcp_perconn_stats_dflt_tpl) #define V_tcp_perconn_stats_enable VNET(tcp_perconn_stats_enable) #endif /* STATS */ #define V_tcp_recvspace VNET(tcp_recvspace) #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) #define V_tcp_sendspace VNET(tcp_sendspace) #define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead) #define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port) #ifdef TCP_HHOOK VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); #define V_tcp_hhh VNET(tcp_hhh) #endif int tcp_addoptions(struct tcpopt *, u_char *); int tcp_ccalgounload(struct cc_algo *unload_algo); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); void tcp_ctlinput_viaudp(int, struct sockaddr *, void *, void *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); void tcp_init(void); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *, const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *, struct mbuf *); void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_dooptions(struct tcpopt *, u_char *, int, int); void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); void tcp_xmit_timer(struct tcpcb *, int); void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type); void cc_conn_init(struct tcpcb *tp); void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); void cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos); void cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos); void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); #ifdef TCP_HHOOK void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); #endif int tcp_input(struct mbuf **, int *, int); int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int); int tcp_input_with_port(struct mbuf **, int *, int, uint16_t); void tcp_handle_wakeup(struct tcpcb *, struct socket *); void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); int register_tcp_functions(struct tcp_function_block *blk, int wait); int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names); int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait); int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force); struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); void tcp_switch_back_to_default(struct tcpcb *tp); struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *fs); int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); extern counter_u64_t tcp_inp_lro_direct_queue; extern counter_u64_t tcp_inp_lro_wokeup_queue; extern counter_u64_t tcp_inp_lro_compressed; extern counter_u64_t tcp_inp_lro_locks_taken; extern counter_u64_t tcp_extra_mbuf; extern counter_u64_t tcp_would_have_but; extern counter_u64_t tcp_comp_total; extern counter_u64_t tcp_uncomp_total; #ifdef NETFLIX_EXP_DETECTION /* Various SACK attack thresholds */ extern int32_t tcp_force_detection; extern int32_t tcp_sack_to_ack_thresh; extern int32_t tcp_sack_to_move_thresh; extern int32_t tcp_restoral_thresh; extern int32_t tcp_sad_decay_val; extern int32_t tcp_sad_pacing_interval; extern int32_t tcp_sad_low_pps; extern int32_t tcp_map_minimum; extern int32_t tcp_attack_on_turns_on_logging; #endif extern uint32_t tcp_ack_war_time_window; extern uint32_t tcp_ack_war_cnt; uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); u_int tcp_maxseg(const struct tcpcb *); u_int tcp_fixed_maxseg(const struct tcpcb *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct inpcb * tcp_drop_syn_sent(struct inpcb *, int); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_output(struct tcpcb *); void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); void tcp_tw_init(void); #ifdef VIMAGE void tcp_tw_destroy(void); #endif void tcp_tw_zone_change(void); int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); void tcp_setpersist(struct tcpcb *); void tcp_slowtimo(void); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, uint16_t, void *, void *); void tcp_timer_activate(struct tcpcb *, uint32_t, u_int); int tcp_timer_suspend(struct tcpcb *, uint32_t); void tcp_timers_unsuspend(struct tcpcb *, uint32_t); int tcp_timer_active(struct tcpcb *, uint32_t); void tcp_timer_stop(struct tcpcb *, uint32_t); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); int inp_to_cpuid(struct inpcb *inp); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); #ifdef VIMAGE void tcp_hc_destroy(void); #endif void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); uint32_t tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, uint32_t); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct pr_usrreqs tcp_usrreqs; uint32_t tcp_new_ts_offset(struct in_conninfo *); tcp_seq tcp_new_isn(struct in_conninfo *); int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); int tcp_dsack_block_exists(struct tcpcb *); void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_dsack_blocks(struct tcpcb *tp); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *); void tcp_lost_retransmission(struct tcpcb *, struct tcphdr *); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); void tcp_sack_lost_retransmission(struct tcpcb *, struct tcphdr *); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); uint32_t tcp_compute_initwnd(uint32_t); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, size_t seed_len); int tcp_can_enable_pacing(void); void tcp_decrement_paced_conn(void); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls); int tcp_stats_init(void); void tcp_log_end_status(struct tcpcb *tp, uint8_t status); static inline void tcp_fields_to_host(struct tcphdr *th) { th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); } static inline void tcp_fields_to_net(struct tcphdr *th) { th->th_seq = htonl(th->th_seq); th->th_ack = htonl(th->th_ack); th->th_win = htons(th->th_win); th->th_urp = htons(th->th_urp); } static inline void tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt, - uint8_t is_tlp, int hw_tls __unused) + uint8_t is_tlp, int hw_tls) { + uint64_t rexmit_percent; + if (is_tlp) { tp->t_sndtlppack++; tp->t_sndtlpbyte += len; } /* To get total bytes sent you must add t_snd_rxt_bytes to t_sndbytes */ if (is_rxt) tp->t_snd_rxt_bytes += len; else tp->t_sndbytes += len; + + if (hw_tls && is_rxt) { + rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL * (tp->t_snd_rxt_bytes + tp->t_sndbytes)); + if (rexmit_percent > ktls_ifnet_max_rexmit_pct) + ktls_disable_ifnet(tp); + } + } #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h index b28c94965c97..7fd8831878b4 100644 --- a/sys/sys/ktls.h +++ b/sys/sys/ktls.h @@ -1,235 +1,248 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014-2019 Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_KTLS_H_ #define _SYS_KTLS_H_ #ifdef _KERNEL #include #include #endif struct tls_record_layer { uint8_t tls_type; uint8_t tls_vmajor; uint8_t tls_vminor; uint16_t tls_length; uint8_t tls_data[0]; } __attribute__ ((packed)); #define TLS_MAX_MSG_SIZE_V10_2 16384 #define TLS_MAX_PARAM_SIZE 1024 /* Max key/mac/iv in sockopt */ #define TLS_AEAD_GCM_LEN 4 #define TLS_1_3_GCM_IV_LEN 12 #define TLS_CHACHA20_IV_LEN 12 #define TLS_CBC_IMPLICIT_IV_LEN 16 /* Type values for the record layer */ #define TLS_RLTYPE_APP 23 /* * Nonce for GCM for TLS 1.2 per RFC 5288. */ struct tls_nonce_data { uint8_t fixed[TLS_AEAD_GCM_LEN]; uint64_t seq; } __packed; /* * AEAD additional data format for TLS 1.2 per RFC 5246. */ struct tls_aead_data { uint64_t seq; /* In network order */ uint8_t type; uint8_t tls_vmajor; uint8_t tls_vminor; uint16_t tls_length; } __packed; /* * AEAD additional data format for TLS 1.3 per RFC 8446. */ struct tls_aead_data_13 { uint8_t type; uint8_t tls_vmajor; uint8_t tls_vminor; uint16_t tls_length; } __packed; /* * Stream Cipher MAC additional data input. This does not match the * exact data on the wire (the sequence number is not placed on the * wire, and any explicit IV after the record header is not covered by * the MAC). */ struct tls_mac_data { uint64_t seq; uint8_t type; uint8_t tls_vmajor; uint8_t tls_vminor; uint16_t tls_length; } __packed; #define TLS_MAJOR_VER_ONE 3 #define TLS_MINOR_VER_ZERO 1 /* 3, 1 */ #define TLS_MINOR_VER_ONE 2 /* 3, 2 */ #define TLS_MINOR_VER_TWO 3 /* 3, 3 */ #define TLS_MINOR_VER_THREE 4 /* 3, 4 */ /* For TCP_TXTLS_ENABLE and TCP_RXTLS_ENABLE. */ #ifdef _KERNEL struct tls_enable_v0 { const uint8_t *cipher_key; const uint8_t *iv; /* Implicit IV. */ const uint8_t *auth_key; int cipher_algorithm; /* e.g. CRYPTO_AES_CBC */ int cipher_key_len; int iv_len; int auth_algorithm; /* e.g. CRYPTO_SHA2_256_HMAC */ int auth_key_len; int flags; uint8_t tls_vmajor; uint8_t tls_vminor; }; #endif struct tls_enable { const uint8_t *cipher_key; const uint8_t *iv; /* Implicit IV. */ const uint8_t *auth_key; int cipher_algorithm; /* e.g. CRYPTO_AES_CBC */ int cipher_key_len; int iv_len; int auth_algorithm; /* e.g. CRYPTO_SHA2_256_HMAC */ int auth_key_len; int flags; uint8_t tls_vmajor; uint8_t tls_vminor; uint8_t rec_seq[8]; }; /* Structure for TLS_GET_RECORD. */ struct tls_get_record { /* TLS record header. */ uint8_t tls_type; uint8_t tls_vmajor; uint8_t tls_vminor; uint16_t tls_length; }; #ifdef _KERNEL struct tls_session_params { uint8_t *cipher_key; uint8_t *auth_key; uint8_t iv[TLS_CBC_IMPLICIT_IV_LEN]; int cipher_algorithm; int auth_algorithm; uint16_t cipher_key_len; uint16_t iv_len; uint16_t auth_key_len; uint16_t max_frame_len; uint8_t tls_vmajor; uint8_t tls_vminor; uint8_t tls_hlen; uint8_t tls_tlen; uint8_t tls_bs; uint8_t flags; }; /* Used in APIs to request RX vs TX sessions. */ #define KTLS_TX 1 #define KTLS_RX 2 struct iovec; struct ktls_session; struct m_snd_tag; struct mbuf; struct sockbuf; struct socket; struct ktls_session { union { int (*sw_encrypt)(struct ktls_session *tls, struct mbuf *m, struct iovec *dst, int iovcnt); int (*sw_decrypt)(struct ktls_session *tls, const struct tls_record_layer *hdr, struct mbuf *m, uint64_t seqno, int *trailer_len); }; union { void *cipher; struct m_snd_tag *snd_tag; }; struct tls_session_params params; u_int wq_index; volatile u_int refcount; int mode; - bool reset_pending; struct task reset_tag_task; + struct task disable_ifnet_task; struct inpcb *inp; + bool reset_pending; + bool disable_ifnet_pending; } __aligned(CACHE_LINE_SIZE); void ktls_check_rx(struct sockbuf *sb); int ktls_enable_rx(struct socket *so, struct tls_enable *en); int ktls_enable_tx(struct socket *so, struct tls_enable *en); void ktls_destroy(struct ktls_session *tls); void ktls_frame(struct mbuf *m, struct ktls_session *tls, int *enqueue_cnt, uint8_t record_type); void ktls_ocf_free(struct ktls_session *tls); int ktls_ocf_try(struct socket *so, struct ktls_session *tls, int direction); void ktls_seq(struct sockbuf *sb, struct mbuf *m); void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count); void ktls_enqueue_to_free(struct mbuf *m); int ktls_get_rx_mode(struct socket *so); int ktls_set_tx_mode(struct socket *so, int mode); int ktls_get_tx_mode(struct socket *so); int ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls); #ifdef RATELIMIT int ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate); #endif static inline struct ktls_session * ktls_hold(struct ktls_session *tls) { if (tls != NULL) refcount_acquire(&tls->refcount); return (tls); } static inline void ktls_free(struct ktls_session *tls) { if (refcount_release(&tls->refcount)) ktls_destroy(tls); } +#ifdef KERN_TLS +extern unsigned int ktls_ifnet_max_rexmit_pct; +void ktls_disable_ifnet(void *arg); +#else +#define ktls_ifnet_max_rexmit_pct 1 +inline void +ktls_disable_ifnet(void *arg __unused) +{ +} +#endif + #endif /* !_KERNEL */ #endif /* !_SYS_KTLS_H_ */