Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F145034481
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
168 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index fa2c60b93cfa..08097ea8c1b9 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -1,3594 +1,3543 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1986, 1991, 1993, 1995
* The Regents of the University of California.
* Copyright (c) 2007-2009 Robert N. M. Watson
* Copyright (c) 2010-2011 Juniper Networks, Inc.
* Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org>
* All rights reserved.
*
* Portions of this software were developed by Robert N. M. Watson under
* contract to Juniper Networks, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_ipsec.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ratelimit.h"
#include "opt_route.h"
#include "opt_rss.h"
#include <sys/param.h>
#include <sys/hash.h>
#include <sys/systm.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <sys/domain.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/smr.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#include <vm/uma.h>
#include <vm/vm.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/if_types.h>
#include <net/if_llatbl.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_pcb_var.h>
#include <netinet/tcp.h>
#ifdef INET
#include <netinet/in_var.h>
#include <netinet/in_fib.h>
#endif
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#endif /* INET6 */
#include <net/route/nhop.h>
#endif
#include <netipsec/ipsec_support.h>
#include <security/mac/mac_framework.h>
/*
 * Bounds on the number of il_inp[] slots in a load balance group: a new
 * group is allocated with INPCBLBGROUP_SIZMIN slots, and
 * in_pcbinslbgrouphash() doubles a full group unless its size has already
 * reached INPCBLBGROUP_SIZMAX.
 */
#define INPCBLBGROUP_SIZMIN 8
#define INPCBLBGROUP_SIZMAX 256
#define INP_FREED 0x00000200 /* Went through in_pcbfree(). */
#define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. */
/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 *
 * The low range defaults are given high-to-low; in_pcb_lport_dest() swaps
 * the two bounds whenever first > last, so either ordering is accepted.
 */
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */
/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 *
 * in_pcbbind_avail() requires PRIV_NETINET_RESERVEDPORT for any port in
 * [ipport_reservedlow, ipport_reservedhigh].
 */
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */
VNET_DEFINE(int, ipport_reservedlow);
/* Enable random ephemeral port allocation by default. */
VNET_DEFINE(int, ipport_randomized) = 1;
#ifdef INET
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
struct in_addr faddr, u_int fport_arg,
struct in_addr laddr, u_int lport_arg,
int lookupflags, uint8_t numa_domain, int fib);
/*
 * Clamp the integer lvalue "var" into the inclusive range [min, max].
 * The body is wrapped in do { } while (0) so that an expansion is exactly
 * one statement and cannot capture an "else" that follows it.
 */
#define	RANGECHK(var, min, max) do {					\
	if ((var) < (min))						\
		(var) = (min);						\
	else if ((var) > (max))						\
		(var) = (max);						\
} while (0)

/*
 * Sysctl handler shared by the automatic port range knobs.
 *
 * The request is first passed to sysctl_handle_int().  If that succeeds,
 * all six range variables of the current VNET are clamped, not only the one
 * the request addressed: the two "low" bounds into
 * [1, IPPORT_RESERVED - 1] and the other four into
 * [IPPORT_RESERVED, IPPORT_MAX].
 *
 * Returns the error code from sysctl_handle_int().
 */
static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, arg1, arg2, req);
	if (error == 0) {
		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
	}
	return (error);
}

#undef RANGECHK
/*
 * The "portrange" node under _net_inet_ip and its six automatic range
 * knobs.  Every knob uses sysctl_net_ipport_check() as its handler, which
 * re-clamps all six range variables after any successful request.
 */
static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"IP Ports");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
"");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
"");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
"");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
"");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
"");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
"");
/*
 * Bounds of the privileged port range and the random allocation toggle.
 * All three variables are defined with VNET_DEFINE(), so each sysctl must
 * carry CTLFLAG_VNET to be resolved against the VNET of the requesting
 * thread; reservedlow previously lacked the flag, unlike its siblings.
 */
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
    CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
    &VNET_NAME(ipport_reservedhigh), 0, "");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
    CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
    &VNET_NAME(ipport_reservedlow), 0, "");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
    CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
#ifdef RATELIMIT
/*
 * Rate limiting statistics, exported read-only under the "rl" node of
 * _net_inet_ip.  Only the counter handles are defined here.
 * NOTE(review): their allocation and the code updating them are not
 * visible in this part of the file.
 */
counter_u64_t rate_limit_new;
counter_u64_t rate_limit_chg;
counter_u64_t rate_limit_active;
counter_u64_t rate_limit_alloc_fail;
counter_u64_t rate_limit_set_ok;
static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"IP Rate Limiting");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
&rate_limit_active, "Active rate limited connections");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
&rate_limit_alloc_fail, "Rate limited connection failures");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
&rate_limit_set_ok, "Rate limited setting succeeded");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
&rate_limit_new, "Total Rate limit new attempts");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
&rate_limit_chg, "Total Rate limited change attempts");
#endif /* RATELIMIT */
#endif /* INET */
VNET_DEFINE(uint32_t, in_pcbhashseed);
static void
in_pcbhashseed_init(void)
{
V_in_pcbhashseed = arc4random();
}
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
in_pcbhashseed_init, NULL);
#ifdef INET
/*
 * Per-VNET toggle, enabled by default.  in_pcbconnect() consults it when
 * the destination passes the in_broadcast() test: if the toggle is zero the
 * connect fails with ENETUNREACH instead of substituting an address of the
 * primary interface.
 */
VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 1;
#define V_connect_inaddr_wild VNET(connect_inaddr_wild)
SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
"Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
#endif
static void in_pcbremhash(struct inpcb *);
/*
* in_pcb.c: manage the Protocol Control Blocks.
*
* NOTE: It is assumed that most of these functions will be called with
* the pcbinfo lock held, and often, the inpcb lock held, as these utility
* functions often modify hash chains or addresses in pcbs.
*/
/*
 * Allocate and initialise an empty load balance group with room for "size"
 * entries in il_inp[].
 *
 * The allocation is zero-filled and made with M_NOWAIT, so NULL is returned
 * when memory is not immediately available.  On success the group holds its
 * own reference on "cred" (taken with crhold()) and a copy of "*addr"; it is
 * not linked into any hash chain here.
 */
static struct inpcblbgroup *
in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port,
    const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib)
{
	struct inpcblbgroup *newgrp;

	newgrp = malloc(__offsetof(struct inpcblbgroup, il_inp[size]), M_PCB,
	    M_ZERO | M_NOWAIT);
	if (newgrp != NULL) {
		LIST_INIT(&newgrp->il_pending);
		newgrp->il_cred = crhold(cred);
		newgrp->il_vflag = vflag;
		newgrp->il_lport = port;
		newgrp->il_numa_domain = numa_domain;
		newgrp->il_fibnum = fib;
		newgrp->il_dependladdr = *addr;
		newgrp->il_inpsiz = size;
	}
	return (newgrp);
}
/*
 * Epoch callback scheduled by in_pcblbgroup_free(): recover the group from
 * its embedded epoch context, drop the credential reference taken at
 * allocation time and release the group's memory.
 */
static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)
{
	struct inpcblbgroup *dead =
	    __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);

	crfree(dead->il_cred);
	free(dead, M_PCB);
}
/*
 * Unlink a load balance group from its hash chain and hand it to
 * NET_EPOCH_CALL() so that the actual release happens in
 * in_pcblbgroup_free_deferred().  The group must have no pending inpcbs.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{

	KASSERT(LIST_EMPTY(&grp->il_pending),
	    ("local group %p still has pending inps", grp));

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
/*
 * Find the load balance group that "inp" belongs to.
 *
 * Only the chain selected by hashing inp_lport is searched.  Within each
 * group both the il_inp[] array and the il_pending list are checked.
 * Returns the matching group, or NULL if the inpcb is in neither place for
 * any group on that chain.
 */
static struct inpcblbgroup *
in_pcblbgroup_find(struct inpcb *inp)
{
	struct inpcbinfo *info;
	struct inpcblbgrouphead *bucket;
	struct inpcblbgroup *grp;
	struct inpcb *pending;
	unsigned int slot;

	INP_LOCK_ASSERT(inp);

	info = inp->inp_pcbinfo;
	INP_HASH_LOCK_ASSERT(info);

	bucket = &info->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, info->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, bucket, il_list) {
		for (slot = 0; slot < grp->il_inpcnt; slot++) {
			if (grp->il_inp[slot] == inp)
				return (grp);
		}
		LIST_FOREACH(pending, &grp->il_pending, inp_lbgroup_list) {
			if (pending == inp)
				return (grp);
		}
	}
	return (NULL);
}
static void
in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp)
{
KASSERT(grp->il_inpcnt < grp->il_inpsiz,
("invalid local group size %d and count %d", grp->il_inpsiz,
grp->il_inpcnt));
INP_WLOCK_ASSERT(inp);
if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp &&
!SOLISTENING(inp->inp_socket)) {
/*
* If this is a TCP socket, it should not be visible to lbgroup
* lookups until listen() has been called.
*/
LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list);
grp->il_pendcnt++;
} else {
grp->il_inp[grp->il_inpcnt] = inp;
/*
* Synchronize with in_pcblookup_lbgroup(): make sure that we
* don't expose a null slot to the lookup path.
*/
atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1);
}
inp->inp_flags |= INP_INLBGROUP;
}
/*
 * Replace "old_grp" with a newly allocated group of "size" slots.
 *
 * The new group inherits the old group's identity (credential, vflag, port,
 * address, NUMA domain, FIB), its il_inp[] entries and its pending list.
 * It is linked at the head of "hdr" before the old group is unlinked and
 * scheduled for release.  Returns the new group, or NULL if the allocation
 * failed, in which case the old group is left untouched.
 */
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *newgrp;
	int slot;

	newgrp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain, old_grp->il_fibnum);
	if (newgrp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < newgrp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	    newgrp->il_inpsiz, old_grp->il_inpcnt));

	/* Carry over the visible members before publishing the new group. */
	slot = 0;
	while (slot < old_grp->il_inpcnt) {
		newgrp->il_inp[slot] = old_grp->il_inp[slot];
		++slot;
	}
	newgrp->il_inpcnt = old_grp->il_inpcnt;
	CK_LIST_INSERT_HEAD(hdr, newgrp, il_list);

	/* Move the pending sockets; the old group must end up empty. */
	LIST_SWAP(&old_grp->il_pending, &newgrp->il_pending, inpcb,
	    inp_lbgroup_list);
	newgrp->il_pendcnt = old_grp->il_pendcnt;
	old_grp->il_pendcnt = 0;

	in_pcblbgroup_free(old_grp);
	return (newgrp);
}
/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 *
 * A group matches when prison, vflag, local port, NUMA domain, FIB and the
 * local address all agree; a new group is created when none does.  A full
 * group is doubled in size unless it is already at INPCBLBGROUP_SIZMAX, in
 * which case a rate limited message is printed and the inpcb is left out.
 *
 * Returns 0 on success, and also when the inpcb was deliberately not
 * inserted (IPv4-mapped wildcard socket, or group at its size limit).
 * Returns ENOBUFS when a group could not be allocated.
 */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	static const struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *info;
	struct inpcblbgrouphead *bucket;
	struct inpcblbgroup *grp;
	uint32_t slot;
	int fib;

	info = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(info);

	if ((inp->inp_flags & INP_BOUNDFIB) != 0)
		fib = inp->inp_inc.inc_fibnum;
	else
		fib = RT_ALL_FIBS;

#ifdef INET6
	/* Don't allow IPv4 mapped INET6 wild socket. */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6))
		return (0);
#endif

	slot = INP_PCBPORTHASH(inp->inp_lport, info->ipi_lbgrouphashmask);
	bucket = &info->ipi_lbgrouphashbase[slot];
	CK_LIST_FOREACH(grp, bucket, il_list) {
		if (grp->il_cred->cr_prison != inp->inp_cred->cr_prison ||
		    grp->il_vflag != inp->inp_vflag ||
		    grp->il_lport != inp->inp_lport ||
		    grp->il_numa_domain != numa_domain ||
		    grp->il_fibnum != fib)
			continue;
		if (memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0)
			break;
	}

	if (grp == NULL) {
		/* No matching group yet: create one and publish it. */
		grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain, fib);
		if (grp == NULL)
			return (ENOBUFS);
		in_pcblbgroup_insert(grp, inp);
		CK_LIST_INSERT_HEAD(bucket, grp, il_list);
		return (0);
	}

	if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) {
		/* The group is full: grow it, unless it is at the limit. */
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}
		grp = in_pcblbgroup_resize(bucket, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOBUFS);
	}
	in_pcblbgroup_insert(grp, inp);
	return (0);
}
/*
 * Remove PCB from load balance group.
 *
 * The inpcb must carry INP_INLBGROUP.  It is searched for in the il_inp[]
 * array and the pending list of every group on the chain selected by its
 * local port; the flag is cleared once it has been removed.  Removing the
 * only array member of a group with no pending sockets frees the group.
 * Not finding the inpcb at all is treated as unreachable.
 */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *info;
	struct inpcblbgrouphead *bucket;
	struct inpcblbgroup *grp;
	struct inpcb *pending;
	int slot;

	info = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	MPASS(inp->inp_flags & INP_INLBGROUP);
	INP_HASH_WLOCK_ASSERT(info);

	bucket = &info->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, info->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, bucket, il_list) {
		/* First look among the members visible to lookups. */
		for (slot = 0; slot < grp->il_inpcnt; ++slot) {
			if (grp->il_inp[slot] == inp)
				break;
		}
		if (slot < grp->il_inpcnt) {
			if (grp->il_inpcnt == 1 &&
			    LIST_EMPTY(&grp->il_pending)) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				/*
				 * Fill the hole with the last entry, then
				 * shrink the count with release semantics
				 * to synchronize with
				 * in_pcblookup_lbgroup().
				 */
				grp->il_inp[slot] =
				    grp->il_inp[grp->il_inpcnt - 1];
				atomic_store_rel_int(&grp->il_inpcnt,
				    grp->il_inpcnt - 1);
			}
			inp->inp_flags &= ~INP_INLBGROUP;
			return;
		}

		/* Otherwise it may still be waiting for listen(). */
		LIST_FOREACH(pending, &grp->il_pending, inp_lbgroup_list) {
			if (pending != inp)
				continue;
			LIST_REMOVE(inp, inp_lbgroup_list);
			grp->il_pendcnt--;
			inp->inp_flags &= ~INP_INLBGROUP;
			return;
		}
	}
	__assert_unreachable();
}
int
in_pcblbgroup_numa(struct inpcb *inp, int arg)
{
struct inpcbinfo *pcbinfo;
int error;
uint8_t numa_domain;
switch (arg) {
case TCP_REUSPORT_LB_NUMA_NODOM:
numa_domain = M_NODOM;
break;
case TCP_REUSPORT_LB_NUMA_CURDOM:
numa_domain = PCPU_GET(domain);
break;
default:
if (arg < 0 || arg >= vm_ndomains)
return (EINVAL);
numa_domain = arg;
}
pcbinfo = inp->inp_pcbinfo;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK(pcbinfo);
if (in_pcblbgroup_find(inp) != NULL) {
/* Remove it from the old group. */
in_pcbremlbgrouphash(inp);
/* Add it to the new group based on numa domain. */
in_pcbinslbgrouphash(inp, numa_domain);
error = 0;
} else {
error = ENOENT;
}
INP_HASH_WUNLOCK(pcbinfo);
return (error);
}
/* Make sure it is safe to use hashinit(9) on CK_LIST. */
CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
/*
 * Initialize an inpcbinfo - a per-VNET instance of connections db.
 *
 * Sets up the info and hash mutexes (named from pcbstor), the empty inpcb
 * list, the exact and wildcard connection hashes, the port hash and the
 * load balance group hash, and records the UMA zone of pcbstor together
 * with that zone's SMR handle.
 *
 * hash_nelements sizes both connection hashes; porthash_nelements, capped
 * at IPPORT_MAX + 1, sizes the port and load balance group hashes.
 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
u_int hash_nelements, u_int porthash_nelements)
{
mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
NULL, MTX_DEF);
#ifdef VIMAGE
pcbinfo->ipi_vnet = curvnet;
#endif
CK_LIST_INIT(&pcbinfo->ipi_listhead);
pcbinfo->ipi_count = 0;
/* Both connection hashes are created with the same size and share ipi_hashmask. */
pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB,
&pcbinfo->ipi_hashmask);
pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB,
&pcbinfo->ipi_hashmask);
porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_porthashmask);
pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_lbgrouphashmask);
pcbinfo->ipi_zone = pcbstor->ips_zone;
/* NOTE(review): the "- " prefix marks a line removed by the enclosing diff. */
- pcbinfo->ipi_portzone = pcbstor->ips_portzone;
pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
}
/*
 * Destroy an inpcbinfo.
 *
 * The database must be empty (ipi_count == 0).  All four hash tables are
 * released with the masks recorded at initialisation time, after which the
 * two mutexes are destroyed in the reverse of the order in which
 * in_pcbinfo_init() created them.
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	/* Connection hashes; both were sized with ipi_hashmask. */
	hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask);

	/* Port and load balance group hashes. */
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);

	mtx_destroy(&pcbinfo->ipi_hash_lock);
	mtx_destroy(&pcbinfo->ipi_lock);
}
/*
 * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
 *
 * "arg" is the struct inpcbstorage to set up.  The inpcb zone is created as
 * an SMR zone using the name, item size and init routine recorded in the
 * storage, with inpcb_fini() as its fini routine.
 *
 * NOTE(review): lines prefixed with "- " are lines removed by the enclosing
 * diff (creation of the separate port zone).
 */
static void inpcb_fini(void *, int);
void
in_pcbstorage_init(void *arg)
{
struct inpcbstorage *pcbstor = arg;
pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
- pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
- sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
- uma_zone_set_smr(pcbstor->ips_portzone,
- uma_zone_get_smr(pcbstor->ips_zone));
}
/*
 * Destroy a pcbstorage - used by unloadable protocols.
 *
 * "arg" is the struct inpcbstorage whose inpcb zone is destroyed.
 *
 * NOTE(review): the line prefixed with "- " is a line removed by the
 * enclosing diff (destruction of the separate port zone).
 */
void
in_pcbstorage_destroy(void *arg)
{
struct inpcbstorage *pcbstor = arg;
uma_zdestroy(pcbstor->ips_zone);
- uma_zdestroy(pcbstor->ips_portzone);
}
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * The inpcb comes from the SMR zone of pcbinfo with M_NOWAIT.  It takes a
 * reference on the socket's credential, inherits the socket's FIB number
 * and starts with a reference count of one.  It is then write-locked and
 * linked at the head of the pcbinfo list under the info lock.
 *
 * Returns 0 on success, ENOBUFS if the zone allocation fails, or the error
 * from mac_inpcb_init() / ipsec_init_pcbpolicy() when those are compiled
 * in; on such an error the credential reference is dropped and the inpcb is
 * returned to the zone.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
int error;
#endif
inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
if (inp == NULL)
return (ENOBUFS);
/* Only the region starting at inp_start_zero is cleared, not the whole inpcb. */
bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
inp->inp_numa_domain = M_NODOM;
#endif
inp->inp_pcbinfo = pcbinfo;
inp->inp_socket = so;
inp->inp_cred = crhold(so->so_cred);
inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
error = mac_inpcb_init(inp, M_NOWAIT);
if (error != 0)
goto out;
mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
error = ipsec_init_pcbpolicy(inp);
if (error != 0) {
#ifdef MAC
/* Undo the MAC label set up above before bailing out. */
mac_inpcb_destroy(inp);
#endif
goto out;
}
#endif /*IPSEC*/
#ifdef INET6
if (INP_SOCKAF(so) == AF_INET6) {
inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
if (V_ip6_v6only)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
else
inp->inp_vflag |= INP_IPV4;
#endif
if (V_ip6_auto_flowlabel)
inp->inp_flags |= IN6P_AUTOFLOWLABEL;
inp->in6p_hops = -1; /* use kernel default */
}
#endif
/*
 * With both INET and INET6 the "else" below pairs with the AF_INET6 test
 * above, so INP_IPV4 is set here only for sockets that are not AF_INET6.
 * With INET alone the assignment is unconditional.
 */
#if defined(INET) && defined(INET6)
else
#endif
#ifdef INET
inp->inp_vflag |= INP_IPV4;
#endif
inp->inp_smr = SMR_SEQ_INVALID;
/*
 * Routes in inpcb's can cache L2 as well; they are guaranteed
 * to be cleaned up.
 */
inp->inp_route.ro_flags = RT_LLE_CACHE;
refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */
/* The inpcb write lock taken here is still held when the function returns. */
INP_WLOCK(inp);
INP_INFO_WLOCK(pcbinfo);
pcbinfo->ipi_count++;
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
INP_INFO_WUNLOCK(pcbinfo);
so->so_pcb = inp;
return (0);
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
/* Error exit: reached before the inpcb was locked or linked anywhere. */
out:
crfree(inp->inp_cred);
#ifdef INVARIANTS
inp->inp_cred = NULL;
#endif
uma_zfree_smr(pcbinfo->ipi_zone, inp);
return (error);
#endif
}
#ifdef INET
/*
 * Bind an unbound IPv4 inpcb to the address and port in "sin", or to an
 * automatically chosen port when "sin" is NULL or carries port zero.
 *
 * The caller holds the inpcb write lock and the pcbinfo hash write lock.
 *
 * Returns 0 on success, EINVAL if the inpcb already has a local address or
 * port, the error from in_pcbbind_setup(), or EAGAIN if the inpcb could not
 * be inserted into the hash, in which case the local address, port and
 * INP_BOUNDFIB are reset.  INP_ANONPORT is set when the port was chosen
 * automatically.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags,
    struct ucred *cred)
{
	int error, wantanon;

	KASSERT(sin == NULL || sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, sin));
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	/* Refuse to bind an inpcb that is already (partially) bound. */
	if (inp->inp_laddr.s_addr != INADDR_ANY || inp->inp_lport != 0)
		return (EINVAL);

	wantanon = (sin == NULL || sin->sin_port == 0);
	error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, flags, cred);
	if (error != 0)
		return (error);

	if (in_pcbinshash(inp) == 0) {
		if (wantanon)
			inp->inp_flags |= INP_ANONPORT;
		return (0);
	}

	/* Hash insertion failed: return the inpcb to its unbound state. */
	inp->inp_laddr.s_addr = INADDR_ANY;
	inp->inp_lport = 0;
	inp->inp_flags &= ~INP_BOUNDFIB;
	return (EAGAIN);
}
#endif
#if defined(INET) || defined(INET6)
/*
 * Assign a local port like in_pcb_lport(), but also used with connect()
 * and a foreign address and port.  If fsa is non-NULL, choose a local port
 * that is unused with those, otherwise one that is completely unused.
 * lsa can be NULL for IPv6.
 *
 * The candidate range is selected by INP_HIGHPORT / INP_LOWPORT on the
 * inpcb (the low range requires PRIV_NETINET_RESERVEDPORT) and is
 * normalised so that first <= last.  Candidates are tried one after another
 * starting from the per-range "last port" cursor in the pcbinfo, which is
 * updated as a side effect; with V_ipport_randomized the cursor is first
 * moved to a random position in the range.
 *
 * On success the chosen port is stored in *lportp in network byte order and
 * 0 is returned.  Returns EADDRNOTAVAIL when every port of the range is in
 * use, or the error from priv_check_cred().
 *
 * NOTE(review): when fsa is non-NULL, lsa is dereferenced unconditionally,
 * so callers that pass a foreign address must also pass a local one.
 */
int
in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa,
    u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred,
    int lookupflags)
{
	struct inpcbinfo *pcbinfo;
	struct inpcb *tmpinp;
	unsigned short *lastport;
	int count, error;
	u_short aux, first, last, lport;
#ifdef INET
	struct in_addr laddr, faddr;
#endif
#ifdef INET6
	struct in6_addr *laddr6, *faddr6;
#endif

	pcbinfo = inp->inp_pcbinfo;

	/*
	 * Because no actual state changes occur here, a global write lock on
	 * the pcbinfo isn't required.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if (inp->inp_flags & INP_HIGHPORT) {
		first = V_ipport_hifirstauto;	/* sysctl */
		last  = V_ipport_hilastauto;
		lastport = &pcbinfo->ipi_lasthi;
	} else if (inp->inp_flags & INP_LOWPORT) {
		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
		if (error)
			return (error);
		first = V_ipport_lowfirstauto;	/* 1023 */
		last  = V_ipport_lowlastauto;	/* 600 */
		lastport = &pcbinfo->ipi_lastlow;
	} else {
		first = V_ipport_firstauto;	/* sysctl */
		last  = V_ipport_lastauto;
		lastport = &pcbinfo->ipi_lastport;
	}

	/*
	 * Instead of having two loops further down counting up or down
	 * make sure that first is always <= last and go with only one
	 * code path implementing all logic.
	 */
	if (first > last) {
		aux = first;
		first = last;
		last = aux;
	}

#ifdef INET
	laddr.s_addr = INADDR_ANY;	/* used by INET6+INET below too */
	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
		if (lsa != NULL)
			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
		if (fsa != NULL)
			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
	}
#endif
#ifdef INET6
	laddr6 = NULL;
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		if (lsa != NULL)
			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
		if (fsa != NULL)
			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
	}
#endif

	tmpinp = NULL;

	/*
	 * A range configured with first == last holds a single port; there
	 * is nothing to randomize, and "% (last - first)" would divide by
	 * zero.
	 */
	if (V_ipport_randomized && first != last)
		*lastport = first + (arc4random() % (last - first));

	count = last - first;

	do {
		if (count-- < 0)	/* completely used? */
			return (EADDRNOTAVAIL);
		++*lastport;
		if (*lastport < first || *lastport > last)
			*lastport = first;
		lport = htons(*lastport);

		if (fsa != NULL) {
#ifdef INET
			if (lsa->sa_family == AF_INET) {
				tmpinp = in_pcblookup_hash_locked(pcbinfo,
				    faddr, fport, laddr, lport, lookupflags,
				    M_NODOM, RT_ALL_FIBS);
			}
#endif
#ifdef INET6
			if (lsa->sa_family == AF_INET6) {
				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
				    faddr6, fport, laddr6, lport, lookupflags,
				    M_NODOM, RT_ALL_FIBS);
			}
#endif
		} else {
#ifdef INET6
			if ((inp->inp_vflag & INP_IPV6) != 0) {
				tmpinp = in6_pcblookup_local(pcbinfo,
				    &inp->in6p_laddr, lport, RT_ALL_FIBS,
				    lookupflags, cred);
#ifdef INET
				if (tmpinp == NULL &&
				    (inp->inp_vflag & INP_IPV4))
					tmpinp = in_pcblookup_local(pcbinfo,
					    laddr, lport, RT_ALL_FIBS,
					    lookupflags, cred);
#endif
			}
#endif
#if defined(INET) && defined(INET6)
			else
#endif
#ifdef INET
				tmpinp = in_pcblookup_local(pcbinfo, laddr,
				    lport, RT_ALL_FIBS, lookupflags, cred);
#endif
		}
	} while (tmpinp != NULL);

	*lportp = lport;

	return (0);
}
/*
 * Select a local port (number) to use.
 *
 * Thin wrapper around in_pcb_lport_dest() without a foreign address: when
 * "laddrp" is non-NULL it is wrapped in a temporary AF_INET sockaddr,
 * otherwise no local address is passed.  The result of
 * in_pcb_lport_dest() is returned unchanged.
 */
int
in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
    struct ucred *cred, int lookupflags)
{
	struct sockaddr_in laddr;
	struct sockaddr *lsa;

	lsa = NULL;
	if (laddrp) {
		bzero(&laddr, sizeof(laddr));
		laddr.sin_family = AF_INET;
		laddr.sin_addr = *laddrp;
		lsa = (struct sockaddr *)&laddr;
	}
	return (in_pcb_lport_dest(inp, lsa, lportp, NULL, 0, cred,
	    lookupflags));
}
#endif /* INET || INET6 */
#ifdef INET
/*
 * Determine whether the inpcb can be bound to the specified address/port tuple.
 *
 * laddr and lport are in network byte order (they are converted with
 * ntohl()/ntohs() before being compared).  sooptions is the socket's option
 * word and lookupflags is handed to in_pcblookup_local() for the conflict
 * check.
 *
 * Returns 0 if the binding is allowed, EADDRNOTAVAIL if a specific
 * non-multicast address is not local and INP_BINDANY is not set, EACCES if
 * the port lies in the reserved range and the credential lacks
 * PRIV_NETINET_RESERVEDPORT, or EADDRINUSE on a conflict with an existing
 * inpcb.
 */
static int
in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr,
const u_short lport, const int fib, int sooptions, int lookupflags,
struct ucred *cred)
{
int reuseport, reuseport_lb;
INP_LOCK_ASSERT(inp);
INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
reuseport = (sooptions & SO_REUSEPORT);
reuseport_lb = (sooptions & SO_REUSEPORT_LB);
if (IN_MULTICAST(ntohl(laddr.s_addr))) {
/*
 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 * allow complete duplication of binding if
 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 * and a multicast address is bound on both
 * new and duplicated sockets.
 */
if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0)
reuseport = SO_REUSEADDR | SO_REUSEPORT;
/*
 * XXX: How to deal with SO_REUSEPORT_LB here?
 * Treat same as SO_REUSEPORT for now.
 */
if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0)
reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB;
} else if (!in_nullhost(laddr)) {
struct sockaddr_in sin;
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
sin.sin_len = sizeof(sin);
sin.sin_addr = laddr;
/*
 * Is the address a local IP address?
 * If INP_BINDANY is set, then the socket may be bound
 * to any endpoint address, local or not.
 */
if ((inp->inp_flags & INP_BINDANY) == 0 &&
ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0)
return (EADDRNOTAVAIL);
}
/* The remaining checks only apply when a specific port was requested. */
if (lport != 0) {
struct inpcb *t;
if (ntohs(lport) <= V_ipport_reservedhigh &&
ntohs(lport) >= V_ipport_reservedlow &&
priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
return (EACCES);
if (!IN_MULTICAST(ntohl(laddr.s_addr)) &&
priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
/*
 * If a socket owned by a different user is already
 * bound to this port, fail. In particular, SO_REUSE*
 * can only be used to share a port among sockets owned
 * by the same user.
 *
 * However, we can share a port with a connected socket
 * which has a unique 4-tuple.
 */
t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport,
RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred);
if (t != NULL &&
(inp->inp_socket->so_type != SOCK_STREAM ||
in_nullhost(t->inp_faddr)) &&
(inp->inp_cred->cr_uid != t->inp_cred->cr_uid))
return (EADDRINUSE);
}
/*
 * Second lookup, restricted to the caller's FIB and lookup flags: the
 * conflict is tolerated only if the existing socket has one of the reuse
 * options that were also requested here.
 */
t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib,
lookupflags, cred);
if (t != NULL && ((reuseport | reuseport_lb) &
t->inp_socket->so_options) == 0) {
/*
 * With INET6 the return below is conditional: the conflict is ignored when
 * both addresses are unspecified and both inpcbs have INP_IPV6PROTO set.
 * Without INET6 the return is unconditional.
 */
#ifdef INET6
if (!in_nullhost(laddr) ||
!in_nullhost(t->inp_laddr) ||
(inp->inp_vflag & INP_IPV6PROTO) == 0 ||
(t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
return (EADDRINUSE);
}
}
return (0);
}
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required.  Callers can either complete the bind by setting
 * inp_laddr/inp_lport and calling in_pcbinshash(), or they can just use
 * the resulting port and address to authorise the sending of a once-off
 * packet.
 *
 * *laddrp and *lportp carry the current local address and port on entry
 * and receive the selected ones on success.  On error, the values of
 * *laddrp and *lportp are not changed.  The only inpcb state touched is
 * the INP_BOUNDFIB flag, which is set on success when INPBIND_FIB was
 * requested.
 *
 * Returns 0 on success, EINVAL if "sin" is given while a local address is
 * already set or if "sin" would change an already assigned port, or the
 * error from prison_local_ip4(), in_pcbbind_avail() or in_pcb_lport().
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
    u_short *lportp, int flags, struct ucred *cred)
{
	struct in_addr addr;
	u_short port;
	int error, lookupflags, sooptions;

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	addr.s_addr = *laddrp;
	if (sin != NULL && addr.s_addr != INADDR_ANY)
		return (EINVAL);

	/* Without any reuse option, wildcard matches count as conflicts. */
	sooptions = atomic_load_int(&inp->inp_socket->so_options);
	if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) != 0)
		lookupflags = 0;
	else
		lookupflags = INPLOOKUP_WILDCARD;

	port = 0;
	if (sin != NULL) {
		int fib;

		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error != 0)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			port = sin->sin_port;
		}
		addr = sin->sin_addr;

		if ((flags & INPBIND_FIB) != 0)
			fib = inp->inp_inc.inc_fibnum;
		else
			fib = RT_ALL_FIBS;

		/* See if this address/port combo is available. */
		error = in_pcbbind_avail(inp, addr, port, fib, sooptions,
		    lookupflags, cred);
		if (error != 0)
			return (error);
	} else {
		error = prison_local_ip4(cred, &addr);
		if (error != 0)
			return (error);
	}

	/* A port that is already assigned always wins. */
	if (*lportp != 0)
		port = *lportp;
	if (port == 0) {
		error = in_pcb_lport(inp, &addr, &port, cred, lookupflags);
		if (error != 0)
			return (error);
	}

	*laddrp = addr.s_addr;
	*lportp = port;
	if ((flags & INPBIND_FIB) != 0)
		inp->inp_flags |= INP_BOUNDFIB;
	return (0);
}
/*
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If don't have a local address for this socket yet,
 * then pick one.
 *
 * The caller holds the inpcb write lock and the pcbinfo hash write lock,
 * and the inpcb must not already have a foreign address (all asserted).
 *
 * Returns 0 on success.  Errors produced here: EADDRNOTAVAIL (destination
 * port is zero, or no usable address on the selected multicast interface),
 * ENETUNREACH (destination passes in_broadcast() while the
 * connect_inaddr_wild knob is off or no interface address exists),
 * EADDRINUSE (an inpcb with the same 4-tuple already exists) and EAGAIN
 * (the initial hash insertion failed).  Errors from prison_get_ip4(),
 * in_pcbladdr() and in_pcb_lport_dest() are passed through.
 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
{
struct in_addr laddr, faddr;
u_short lport;
int error;
bool anonport;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
KASSERT(in_nullhost(inp->inp_faddr),
("%s: inp is already connected", __func__));
KASSERT(sin->sin_family == AF_INET,
("%s: invalid address family for %p", __func__, sin));
KASSERT(sin->sin_len == sizeof(*sin),
("%s: invalid address length for %p", __func__, sin));
if (sin->sin_port == 0)
return (EADDRNOTAVAIL);
/* Remember whether a local port has to be chosen automatically. */
anonport = (inp->inp_lport == 0);
if (__predict_false(in_broadcast(sin->sin_addr))) {
if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead))
return (ENETUNREACH);
/*
 * If the destination address is INADDR_ANY, use the primary
 * local address. If the supplied address is INADDR_BROADCAST,
 * and the primary interface supports broadcast, choose the
 * broadcast address for that interface.
 */
if (in_nullhost(sin->sin_addr)) {
faddr =
IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
if ((error = prison_get_ip4(cred, &faddr)) != 0)
return (error);
} else if (sin->sin_addr.s_addr == INADDR_BROADCAST) {
if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags
& IFF_BROADCAST)
faddr = satosin(&CK_STAILQ_FIRST(
&V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
else
faddr = sin->sin_addr;
}
} else
faddr = sin->sin_addr;
if (in_nullhost(inp->inp_laddr)) {
/* The error is examined only after the multicast override below. */
error = in_pcbladdr(inp, &faddr, &laddr, cred);
/*
 * If the destination address is multicast and an outgoing
 * interface has been set as a multicast option, prefer the
 * address of that interface as our source address.
 */
if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
inp->inp_moptions != NULL &&
inp->inp_moptions->imo_multicast_ifp != NULL) {
struct ifnet *ifp =
inp->inp_moptions->imo_multicast_ifp;
struct in_ifaddr *ia;
CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (ia->ia_ifp == ifp &&
prison_check_ip4(cred,
&ia->ia_addr.sin_addr) == 0)
break;
}
if (ia == NULL)
return (EADDRNOTAVAIL);
laddr = ia->ia_addr.sin_addr;
error = 0;
}
if (error)
return (error);
} else
laddr = inp->inp_laddr;
if (anonport) {
struct sockaddr_in lsin = {
.sin_family = AF_INET,
.sin_addr = laddr,
};
struct sockaddr_in fsin = {
.sin_family = AF_INET,
.sin_addr = faddr,
};
error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin,
&lport, (struct sockaddr *)&fsin, sin->sin_port, cred,
INPLOOKUP_WILDCARD);
if (error)
return (error);
} else if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) !=
NULL)
return (EADDRINUSE);
else
lport = inp->inp_lport;
/* From here on the inpcb itself is modified. */
inp->inp_faddr = faddr;
inp->inp_fport = sin->sin_port;
/* Do the initial binding of the local address if required. */
if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
inp->inp_lport = lport;
inp->inp_laddr = laddr;
if (in_pcbinshash(inp) != 0) {
inp->inp_laddr.s_addr = inp->inp_faddr.s_addr =
INADDR_ANY;
inp->inp_lport = inp->inp_fport = 0;
return (EAGAIN);
}
} else {
inp->inp_lport = lport;
inp->inp_laddr = laddr;
if ((inp->inp_flags & INP_INHASHLIST) != 0)
in_pcbrehash(inp);
else
/*
 * NOTE(review): unlike the branch above, the return value of
 * in_pcbinshash() is ignored here - confirm that a failure on this path is
 * impossible or acceptable.
 */
in_pcbinshash(inp);
}
#ifdef ROUTE_MPATH
if (CALC_FLOWID_OUTBOUND) {
uint32_t hash_val, hash_type;
hash_val = fib4_calc_software_hash(inp->inp_laddr,
inp->inp_faddr, 0, sin->sin_port,
inp->inp_socket->so_proto->pr_protocol, &hash_type);
inp->inp_flowid = hash_val;
inp->inp_flowtype = hash_type;
}
#endif
if (anonport)
inp->inp_flags |= INP_ANONPORT;
return (0);
}
/*
* Do proper source address selection on an unbound socket in case
* of connect. Take jails into account as well.
*/
/*
 * Return the first IPv4 address configured on 'ifp' for which
 * prison_check_ip4() succeeds with 'cred', or NULL if there is none.
 */
static struct in_ifaddr *
in_pcbladdr_jail_ifaddr(struct ifnet *ifp, struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr_in *sin;

	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
		if (ifa->ifa_addr->sa_family != AF_INET)
			continue;
		sin = (struct sockaddr_in *)ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0)
			return ((struct in_ifaddr *)ifa);
	}
	return (NULL);
}

/*
 * Select a local (source) IPv4 address for reaching 'faddr' and store it in
 * '*laddr'.
 *
 * Returns 0 on success, ENETUNREACH when no interface can be found for the
 * destination, EHOSTUNREACH when the selected address turns out to be
 * INADDR_ANY, or the error returned by prison_get_ip4().
 *
 * Must be called inside the network epoch; 'laddr' must not be NULL.
 */
int
in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr,
    struct in_addr *laddr, struct ucred *cred)
{
	struct in_ifaddr *ia;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (!prison_saddrsel_ip4(cred, laddr))
		return (0);

	error = 0;
	nh = NULL;

	/* Destination as a sockaddr, for the ifa_ifwith*() lookups below. */
	bzero(&dst, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_len = sizeof(struct sockaddr_in);
	dst.sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
		    0, NHR_NONE, 0);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (nh == NULL || nh->nh_ifp == NULL) {
		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
		    inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
			    inp->inp_socket->so_fibnum));
		}
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* Not jailed: the interface address is good as is. */
		if (!prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed: look for a jail address on the same interface. */
		ia = in_pcbladdr_jail_ifaddr(ia->ia_ifp, cred);
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
		/* If not jailed, use the default returned. */
		if (!prison_flag(cred, PR_IP4)) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = in_pcbladdr_jail_ifaddr(nh->nh_ifp, cred);
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
		    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
			    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));

		if (!prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			ia = in_pcbladdr_jail_ifaddr(ia->ia_ifp, cred);
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				goto done;
			}
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	/* A successful selection must never yield the wildcard address. */
	if (error == 0 && laddr->s_addr == INADDR_ANY)
		return (EHOSTUNREACH);
	return (error);
}
/*
 * Disconnect an inpcb: remove it from the hash and clear its local address,
 * foreign address and foreign port.  The local port is left untouched.
 *
 * The caller must hold both the inpcb write lock and the pcbinfo hash write
 * lock.  Calling this on an inpcb whose inp_smr is already set trips the
 * assertion below.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
("%s: inp %p was already disconnected", __func__, inp));
in_pcbremhash_locked(inp);
/* See the comment in in_pcbinshash(). */
/* Record the SMR sequence number obtained after the hash removal. */
inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
inp->inp_laddr.s_addr = INADDR_ANY;
inp->inp_faddr.s_addr = INADDR_ANY;
inp->inp_fport = 0;
}
#endif /* INET */
/*
 * For an inpcb flagged INP_INLBGROUP: under the hash write lock, unlink it
 * from its inp_lbgroup_list, decrement the group's pending count and insert
 * it into the group with in_pcblbgroup_insert().  Other inpcbs are left
 * untouched.  The caller must hold the inpcb write lock.
 */
void
in_pcblisten(struct inpcb *inp)
{
	struct inpcbinfo *info;
	struct inpcblbgroup *lbg;

	INP_WLOCK_ASSERT(inp);

	/* Nothing to do unless the pcb is part of a load-balancing group. */
	if ((inp->inp_flags & INP_INLBGROUP) == 0)
		return;

	info = inp->inp_pcbinfo;
	INP_HASH_WLOCK(info);
	lbg = in_pcblbgroup_find(inp);
	LIST_REMOVE(inp, inp_lbgroup_list);
	lbg->il_pendcnt--;
	in_pcblbgroup_insert(lbg, inp);
	INP_HASH_WUNLOCK(info);
}
/*
* inpcb hash lookups are protected by SMR section.
*
* Once desired pcb has been found, switching from SMR section to a pcb
* lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
* here because SMR is a critical section.
* In 99%+ cases inp_smr_lock() would obtain the lock immediately.
*/
/*
 * Acquire the inpcb lock: a read lock when 'lock' is INPLOOKUP_RLOCKPCB,
 * a write lock for any other value.
 */
void
inp_lock(struct inpcb *inp, const inp_lookup_t lock)
{
	if (lock == INPLOOKUP_RLOCKPCB)
		rw_rlock(&inp->inp_lock);
	else
		rw_wlock(&inp->inp_lock);
}
/*
 * Release the inpcb lock: a read unlock when 'lock' is INPLOOKUP_RLOCKPCB,
 * a write unlock for any other value.
 */
void
inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
{
	if (lock == INPLOOKUP_RLOCKPCB)
		rw_runlock(&inp->inp_lock);
	else
		rw_wunlock(&inp->inp_lock);
}
/*
 * Try to acquire the inpcb lock in the mode selected by 'lock' and return
 * the result of the underlying rw_try_rlock()/rw_try_wlock() call.
 */
int
inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
{
	if (lock == INPLOOKUP_RLOCKPCB)
		return (rw_try_rlock(&inp->inp_lock));
	return (rw_try_wlock(&inp->inp_lock));
}
/*
 * Switch from the SMR read section to a lock on 'inp'.
 *
 * Must be called inside the pcbinfo SMR section; the section is exited on
 * every return path.  Returns true with 'inp' locked in the mode given by
 * 'lock', or false with 'inp' not locked, when the inpcb could not be
 * referenced, was released, or has any of 'ignflags' set in inp_flags.
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{
MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);
/* Fast path: the lock is available without blocking. */
if (__predict_true(inp_trylock(inp, lock))) {
if (__predict_false(inp->inp_flags & ignflags)) {
smr_exit(inp->inp_pcbinfo->ipi_smr);
inp_unlock(inp, lock);
return (false);
}
smr_exit(inp->inp_pcbinfo->ipi_smr);
return (true);
}
/*
 * Slow path: take a reference so the inpcb stays valid, leave the SMR
 * section, and only then block on the lock.
 */
if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
smr_exit(inp->inp_pcbinfo->ipi_smr);
inp_lock(inp, lock);
/* Ours was the last reference: in_pcbrele() unlocked and freed it. */
if (__predict_false(in_pcbrele(inp, lock)))
return (false);
/*
 * An inp acquired through refcount & lock certainly did not go
 * through uma_zfree().  However, it may have already gone
 * through in_pcbfree() and have another reference that
 * prevented its release by our in_pcbrele().
 */
if (__predict_false(inp->inp_flags & ignflags)) {
inp_unlock(inp, lock);
return (false);
}
return (true);
} else {
/* The reference count was already zero. */
smr_exit(inp->inp_pcbinfo->ipi_smr);
return (false);
}
}
/*
 * Lock an inpcb found during an SMR-protected lookup.  See _inp_smr_lock()
 * for the locking contract.
 */
bool
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
{
	/*
	 * The in_pcblookup() family must skip freed entries, which can be
	 * encountered because the hash is read without a lock, and dropped
	 * entries as well.
	 */
	const int skipflags = INP_FREED | INP_DROPPED;

	return (_inp_smr_lock(inp, lock, skipflags));
}
/*
* inp_next() - inpcb hash/list traversal iterator
*
* Requires initialized struct inpcb_iterator for context.
* The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
*
* - Iterator can have either write-lock or read-lock semantics, that can not
* be changed later.
* - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
* a single hash slot. Note: only rip_input() does the latter.
* - Iterator may have optional bool matching function. The matching function
* will be executed for each inpcb in the SMR context, so it can not acquire
* locks and can safely access only immutable fields of inpcb.
*
* A freshly initialized iterator has a NULL inpcb in its context, which
* means that the inp_next() call will return the very first inpcb on the list,
* locked with the desired semantics. In all following calls the context pointer
* shall hold the current inpcb pointer. The KPI user is not supposed to
* unlock the current inpcb! Upon end of traversal inp_next() will return NULL
* and write NULL to its context. After end of traversal an iterator can be
* reused.
*
* List traversals have the following features/constraints:
* - New entries won't be seen, as they are always added to the head of a list.
* - Removed entries won't stop traversal as long as they are not added to
* a different list. This is violated by in_pcbrehash().
*/
/*
 * First entry to visit: the head of the all-pcbs list when 'hash' is
 * INP_ALL_LIST, otherwise the head of exact-hash slot 'hash'.
 */
#define II_LIST_FIRST(ipi, hash) \
(((hash) == INP_ALL_LIST) ? \
CK_LIST_FIRST(&(ipi)->ipi_listhead) : \
CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
/* Successor of 'inp' on whichever list II_LIST_FIRST() selected. */
#define II_LIST_NEXT(inp, hash) \
(((hash) == INP_ALL_LIST) ? \
CK_LIST_NEXT((inp), inp_list) : \
CK_LIST_NEXT((inp), inp_hash_exact))
/* Assert that 'inp' is locked in the mode the iterator was created with. */
#define II_LOCK_ASSERT(inp, lock) \
rw_assert(&(inp)->inp_lock, \
(lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
/*
 * Advance the iterator 'ii' and return the next matching inpcb, locked in
 * the iterator's lock mode, or NULL at the end of the list.  On a non-first
 * call the previously returned inpcb (ii->inp) is unlocked before returning.
 */
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
const struct inpcbinfo *ipi = ii->ipi;
inp_match_t *match = ii->match;
void *ctx = ii->ctx;
inp_lookup_t lock = ii->lock;
int hash = ii->hash;
struct inpcb *inp;
if (ii->inp == NULL) { /* First call. */
smr_enter(ipi->ipi_smr);
/* This is unrolled CK_LIST_FOREACH(). */
for (inp = II_LIST_FIRST(ipi, hash);
inp != NULL;
inp = II_LIST_NEXT(inp, hash)) {
if (match != NULL && (match)(inp, ctx) == false)
continue;
/* On success _inp_smr_lock() has already left the SMR section. */
if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
break;
else {
/*
 * Locking failed and the SMR section was exited, so there is
 * no stable position left: re-enter and start over from the
 * list head.
 * NOTE(review): the for-loop increment then runs on the new
 * head, so that entry itself is not examined -- confirm this
 * is intended.
 */
smr_enter(ipi->ipi_smr);
MPASS(inp != II_LIST_FIRST(ipi, hash));
inp = II_LIST_FIRST(ipi, hash);
if (inp == NULL)
break;
}
}
/* Nothing locked: we are still inside the SMR section, so leave it. */
if (inp == NULL)
smr_exit(ipi->ipi_smr);
else
ii->inp = inp;
return (inp);
}
/* Not a first call. */
smr_enter(ipi->ipi_smr);
restart:
/* ii->inp is still locked and serves as the anchor for the traversal. */
inp = ii->inp;
II_LOCK_ASSERT(inp, lock);
next:
inp = II_LIST_NEXT(inp, hash);
if (inp == NULL) {
/* End of list: 'found' unlocks the previous entry and stores NULL. */
smr_exit(ipi->ipi_smr);
goto found;
}
if (match != NULL && (match)(inp, ctx) == false)
goto next;
if (__predict_true(inp_trylock(inp, lock))) {
if (__predict_false(inp->inp_flags & INP_FREED)) {
/*
 * Entries are never inserted in the middle of a list, thus
 * as long as we are in SMR, we can continue traversal.
 * A jump to 'restart' should yield the same result,
 * but could produce unnecessary looping. Could this
 * looping be unbounded?
 */
inp_unlock(inp, lock);
goto next;
} else {
smr_exit(ipi->ipi_smr);
goto found;
}
}
/*
 * Can't obtain the lock immediately, so take the slow path. Once we exit
 * the SMR section we can no longer jump to 'next', and our only stable
 * anchoring point is ii->inp, which we keep locked for this case, so
 * we jump to 'restart'.
 */
if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
smr_exit(ipi->ipi_smr);
inp_lock(inp, lock);
/* Ours was the last reference: the inpcb was freed, start again. */
if (__predict_false(in_pcbrele(inp, lock))) {
smr_enter(ipi->ipi_smr);
goto restart;
}
/*
 * See comment in inp_smr_lock().
 */
if (__predict_false(inp->inp_flags & INP_FREED)) {
inp_unlock(inp, lock);
smr_enter(ipi->ipi_smr);
goto restart;
}
} else
goto next;
found:
/* Release the previous entry and remember the new one (possibly NULL). */
inp_unlock(ii->inp, lock);
ii->inp = inp;
return (ii->inp);
}
/*
* in_pcbref() bumps the reference count on an inpcb in order to maintain
* stability of an inpcb pointer despite the inpcb lock being released or
* SMR section exited.
*
* To drop the reference later, in_pcbrele_(r|w)locked() must be called.
*/
void
in_pcbref(struct inpcb *inp)
{
u_int old __diagused;
old = refcount_acquire(&inp->inp_refcount);
KASSERT(old > 0, ("%s: refcount 0", __func__));
}
/*
* Drop a reference on an inpcb that was taken with in_pcbref(), potentially
* freeing the pcb if it was the last reference.
*/
/*
 * Drop one reference on a read-locked inpcb.
 *
 * Returns false if other references remain; the inpcb stays read-locked.
 * Returns true if this was the last reference; the credential is released,
 * the inpcb is read-unlocked and handed to uma_zfree_smr().
 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{
	INP_RLOCK_ASSERT(inp);

	if (refcount_release(&inp->inp_refcount)) {
		/* Last reference: the pcb must already be freed and detached. */
		MPASS(inp->inp_flags & INP_FREED);
		MPASS(inp->inp_socket == NULL);
		crfree(inp->inp_cred);
#ifdef INVARIANTS
		inp->inp_cred = NULL;
#endif
		INP_RUNLOCK(inp);
		uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
		return (true);
	}
	return (false);
}
/*
 * Drop one reference on a write-locked inpcb.
 *
 * Returns false if other references remain; the inpcb stays write-locked.
 * Returns true if this was the last reference; the credential is released,
 * the inpcb is write-unlocked and handed to uma_zfree_smr().
 */
bool
in_pcbrele_wlocked(struct inpcb *inp)
{
	INP_WLOCK_ASSERT(inp);

	if (refcount_release(&inp->inp_refcount)) {
		/* Last reference: the pcb must already be freed and detached. */
		MPASS(inp->inp_flags & INP_FREED);
		MPASS(inp->inp_socket == NULL);
		crfree(inp->inp_cred);
#ifdef INVARIANTS
		inp->inp_cred = NULL;
#endif
		INP_WUNLOCK(inp);
		uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
		return (true);
	}
	return (false);
}
/*
 * Drop one reference on a locked inpcb, dispatching on the lock mode:
 * INPLOOKUP_RLOCKPCB selects the read-locked variant, anything else the
 * write-locked one.  Returns what the selected variant returns.
 */
bool
in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
{
	if (lock == INPLOOKUP_RLOCKPCB)
		return (in_pcbrele_rlocked(inp));
	return (in_pcbrele_wlocked(inp));
}
/*
* Unconditionally schedule an inpcb to be freed by decrementing its
* reference count, which should occur only after the inpcb has been detached
* from its socket. If another thread holds a temporary reference (acquired
* using in_pcbref()) then the free is deferred until that reference is
* released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
* Almost all work, including removal from global lists, is done in this
* context, where the pcbinfo lock is held.
*/
/*
 * Detach 'inp' from its socket and the pcbinfo lists, mark it INP_FREED,
 * release its per-pcb resources and drop a reference.
 *
 * The caller must hold the inpcb write lock and the inpcb must still be
 * attached to a socket.  The inpcb is unlocked on return.
 */
void
in_pcbfree(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
struct ip_moptions *imo;
#endif
#ifdef INET6
struct ip6_moptions *im6o;
#endif
INP_WLOCK_ASSERT(inp);
KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
KASSERT((inp->inp_flags & INP_FREED) == 0,
("%s: called twice for pcb %p", __func__, inp));
/*
 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
 * from the hash without acquiring inpcb lock, they rely on the hash
 * lock, thus in_pcbremhash() should be the first action.
 */
if (inp->inp_flags & INP_INHASHLIST)
in_pcbremhash(inp);
/* Unlink from the all-pcbs list and update the pcbinfo accounting. */
INP_INFO_WLOCK(pcbinfo);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
pcbinfo->ipi_count--;
CK_LIST_REMOVE(inp, inp_list);
INP_INFO_WUNLOCK(pcbinfo);
#ifdef RATELIMIT
if (inp->inp_snd_tag != NULL)
in_pcbdetach_txrtlmt(inp);
#endif
/* Mark as freed and break the socket <-> inpcb linkage in both directions. */
inp->inp_flags |= INP_FREED;
inp->inp_socket->so_pcb = NULL;
inp->inp_socket = NULL;
RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
if (inp->inp_sp != NULL)
ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
if (inp->inp_options)
(void)m_free(inp->inp_options);
DEBUG_POISON_POINTER(inp->inp_options);
/* Save the multicast options; they are freed after the inpcb is unlocked. */
imo = inp->inp_moptions;
DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
if (inp->inp_vflag & INP_IPV6PROTO) {
ip6_freepcbopts(inp->in6p_outputopts);
DEBUG_POISON_POINTER(inp->in6p_outputopts);
im6o = inp->in6p_moptions;
DEBUG_POISON_POINTER(inp->in6p_moptions);
} else
im6o = NULL;
#endif
/*
 * Drop a reference.  If it was the last one, in_pcbrele_wlocked() has
 * already unlocked and released the inpcb; otherwise unlock it here.
 */
if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
INP_WUNLOCK(inp);
}
/* 'inp' must not be touched past this point; use only the saved pointers. */
#ifdef INET6
ip6_freemoptions(im6o);
#endif
#ifdef INET
inp_freemoptions(imo);
#endif
}
/*
* Different protocols initialize their inpcbs differently - giving
* different name to the lock. But they all are disposed the same.
*/
/*
 * Destroy the lock of the inpcb stored at 'mem'.  'size' is not used.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp;

	inp = mem;
	INP_LOCK_DESTROY(inp);
}
/*
* in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
* port reservation, and preventing it from being returned by inpcb lookups.
*
* It is used by TCP to mark an inpcb as unused and avoid future packet
* delivery or event notification when a socket remains open but TCP has
* closed. This might occur as a result of a shutdown()-initiated TCP close
* or a RST on the wire, and allows the port binding to be reused while still
* maintaining the invariant that so_pcb always points to a valid inpcb until
* in_pcbdetach().
*
* XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
* in_pcbpurgeif0()?
*/
/*
 * Flag 'inp' as INP_DROPPED and, if it is currently hashed, remove it from
 * the hash.  The caller must hold the inpcb write lock.
 */
void
in_pcbdrop(struct inpcb *inp)
{
	INP_WLOCK_ASSERT(inp);

	inp->inp_flags |= INP_DROPPED;
	if ((inp->inp_flags & INP_INHASHLIST) == 0)
		return;
	in_pcbremhash(inp);
}
#ifdef INET
/*
* Common routines to return the socket addresses associated with inpcbs.
*/
/*
 * Store the local IPv4 address and port of the socket's inpcb into 'sa',
 * which is written as a struct sockaddr_in.  Always returns 0.
 */
int
in_getsockaddr(struct socket *so, struct sockaddr *sa)
{
	struct sockaddr_in *out = (struct sockaddr_in *)sa;
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));

	/* Whole-structure assignment: unnamed members are zeroed. */
	*out = (struct sockaddr_in){
		.sin_len = sizeof(struct sockaddr_in),
		.sin_family = AF_INET,
		.sin_port = inp->inp_lport,
		.sin_addr = inp->inp_laddr,
	};
	return (0);
}
/*
 * Store the foreign IPv4 address and port of the socket's inpcb into 'sa',
 * which is written as a struct sockaddr_in.  Always returns 0.
 */
int
in_getpeeraddr(struct socket *so, struct sockaddr *sa)
{
	struct sockaddr_in *out = (struct sockaddr_in *)sa;
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));

	/* Whole-structure assignment: unnamed members are zeroed. */
	*out = (struct sockaddr_in){
		.sin_len = sizeof(struct sockaddr_in),
		.sin_family = AF_INET,
		.sin_port = inp->inp_fport,
		.sin_addr = inp->inp_faddr,
	};
	return (0);
}
/*
 * Iterator match function: accept inpcbs that have INP_IPV4 set and carry
 * IPv4 multicast options.  The second argument is unused.
 */
static bool
inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
{
	return ((inp->inp_vflag & INP_IPV4) != 0 &&
	    inp->inp_moptions != NULL);
}
/*
 * For every IPv4 inpcb of 'pcbinfo' that has multicast options, remove all
 * references to interface 'ifp': clear it as the selected outgoing multicast
 * interface and leave every group whose membership is on that interface.
 * The caller must satisfy IN_MULTI_LOCK_ASSERT().
 */
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
inp_v4_multi_match, NULL);
struct inpcb *inp;
struct in_multi *inm;
struct in_mfilter *imf;
struct ip_moptions *imo;
IN_MULTI_LOCK_ASSERT();
/* inp_next() returns each matching inpcb write-locked. */
while ((inp = inp_next(&inpi)) != NULL) {
INP_WLOCK_ASSERT(inp);
/* Non-NULL here: inp_v4_multi_match() only accepts pcbs with moptions. */
imo = inp->inp_moptions;
/*
 * Unselect the outgoing interface if it is being
 * detached.
 */
if (imo->imo_multicast_ifp == ifp)
imo->imo_multicast_ifp = NULL;
/*
 * Drop multicast group membership if we joined
 * through the interface being detached.
 *
 * XXX This can all be deferred to an epoch_call
 */
restart:
IP_MFILTER_FOREACH(imf, &imo->imo_head) {
if ((inm = imf->imf_inm) == NULL)
continue;
if (inm->inm_ifp != ifp)
continue;
ip_mfilter_remove(&imo->imo_head, imf);
in_leavegroup_locked(inm, NULL);
ip_mfilter_free(imf);
/* The filter list was modified during iteration; rescan from its head. */
goto restart;
}
}
}
/*
* Lookup a PCB based on the local address and port. Caller must hold the
* hash lock. No inpcb locks or references are acquired.
*/
/* Extra wildcard cost charged to a candidate PCB that has INP_IPV6 set. */
#define INP_LOOKUP_MAPPED_PCB_COST 3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
u_short lport, int fib, int lookupflags, struct ucred *cred)
{
struct inpcb *inp;
/*
 * Best (lowest) wildcard score seen so far.  The initial value is larger
 * than any score a candidate can accumulate below, so the first acceptable
 * candidate always becomes the current match.
 */
#ifdef INET6
int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
int matchwild = 3;
#endif
int wildcard;
KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
("%s: invalid lookup flags %d", __func__, lookupflags));
KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs),
("%s: invalid fib %d", __func__, fib));
INP_HASH_LOCK_ASSERT(pcbinfo);
/* Without INPLOOKUP_WILDCARD only an exact local address/port match counts. */
if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
struct inpcbhead *head;
/*
 * Look for an unconnected (wildcard foreign addr) PCB that
 * matches the local address and port we're looking for.
 */
head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
pcbinfo->ipi_hashmask)];
CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr == INADDR_ANY &&
inp->inp_laddr.s_addr == laddr.s_addr &&
inp->inp_lport == lport && (fib == RT_ALL_FIBS ||
inp->inp_inc.inc_fibnum == fib)) {
/*
 * Found?
 */
if (prison_equal_ip4(cred->cr_prison,
inp->inp_cred->cr_prison))
return (inp);
}
}
/*
 * Not found.
 */
return (NULL);
} else {
- struct inpcbporthead *porthash;
- struct inpcbport *phd;
+ struct inpcbhead *porthash;
struct inpcb *match = NULL;
+
/*
- * Best fit PCB lookup.
- *
- * First see if this local port is in use by looking on the
- * port hash list.
+ * Port is in use by one or more PCBs. Look for best
+ * fit.
*/
porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
pcbinfo->ipi_porthashmask)];
- CK_LIST_FOREACH(phd, porthash, phd_hash) {
- if (phd->phd_port == lport)
- break;
- }
- if (phd != NULL) {
+ CK_LIST_FOREACH(inp, porthash, inp_portlist) {
+ if (inp->inp_lport != lport)
+ continue;
+ if (!prison_equal_ip4(inp->inp_cred->cr_prison,
+ cred->cr_prison))
+ continue;
+ if (fib != RT_ALL_FIBS &&
+ inp->inp_inc.inc_fibnum != fib)
+ continue;
+ wildcard = 0;
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
/*
- * Port is in use by one or more PCBs. Look for best
- * fit.
+ * We never select the PCB that has INP_IPV6 flag and
+ * is bound to :: if we have another PCB which is bound
+ * to 0.0.0.0. If a PCB has the INP_IPV6 flag, then we
+ * set its cost higher than IPv4 only PCBs.
+ *
+ * Note that the case only happens when a socket is
+ * bound to ::, under the condition that the use of the
+ * mapped address is allowed.
*/
- CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
- wildcard = 0;
- if (!prison_equal_ip4(inp->inp_cred->cr_prison,
- cred->cr_prison))
- continue;
- if (fib != RT_ALL_FIBS &&
- inp->inp_inc.inc_fibnum != fib)
- continue;
-#ifdef INET6
- /* XXX inp locking */
- if ((inp->inp_vflag & INP_IPV4) == 0)
- continue;
- /*
- * We never select the PCB that has
- * INP_IPV6 flag and is bound to :: if
- * we have another PCB which is bound
- * to 0.0.0.0. If a PCB has the
- * INP_IPV6 flag, then we set its cost
- * higher than IPv4 only PCBs.
- *
- * Note that the case only happens
- * when a socket is bound to ::, under
- * the condition that the use of the
- * mapped address is allowed.
- */
- if ((inp->inp_vflag & INP_IPV6) != 0)
- wildcard += INP_LOOKUP_MAPPED_PCB_COST;
+ if ((inp->inp_vflag & INP_IPV6) != 0)
+ wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
- if (inp->inp_faddr.s_addr != INADDR_ANY)
+ if (inp->inp_faddr.s_addr != INADDR_ANY)
+ wildcard++;
+ if (inp->inp_laddr.s_addr != INADDR_ANY) {
+ if (laddr.s_addr == INADDR_ANY)
+ wildcard++;
+ else if (inp->inp_laddr.s_addr != laddr.s_addr)
+ continue;
+ } else {
+ if (laddr.s_addr != INADDR_ANY)
wildcard++;
- if (inp->inp_laddr.s_addr != INADDR_ANY) {
- if (laddr.s_addr == INADDR_ANY)
- wildcard++;
- else if (inp->inp_laddr.s_addr != laddr.s_addr)
- continue;
- } else {
- if (laddr.s_addr != INADDR_ANY)
- wildcard++;
- }
- if (wildcard < matchwild) {
- match = inp;
- matchwild = wildcard;
- if (matchwild == 0)
- break;
- }
+ }
+ if (wildcard < matchwild) {
+ match = inp;
+ matchwild = wildcard;
+ if (matchwild == 0)
+ break;
}
}
return (match);
}
}
#undef INP_LOOKUP_MAPPED_PCB_COST
static bool
in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
{
return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
(fib == RT_ALL_FIBS || fib == grp->il_fibnum));
}
/*
 * Pick an inpcb from the best-matching load-balancing group bound to
 * 'lport'.  The member is selected by hashing (faddr, lport, fport) modulo
 * the group's member count.  Returns NULL if no group matches or the chosen
 * group has no members.  No inpcb lock or reference is acquired here.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
uint16_t lport, int domain, int fib)
{
const struct inpcblbgrouphead *hdr;
struct inpcblbgroup *grp;
struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
struct inpcb *inp;
u_int count;
INP_HASH_LOCK_ASSERT(pcbinfo);
NET_EPOCH_ASSERT();
hdr = &pcbinfo->ipi_lbgrouphashbase[
INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
/*
 * Search for an LB group match based on the following criteria:
 * - prefer jailed groups to non-jailed groups
 * - prefer exact source address matches to wildcard matches
 * - prefer groups bound to the specified NUMA domain
 */
jail_exact = jail_wild = local_exact = local_wild = NULL;
CK_LIST_FOREACH(grp, hdr, il_list) {
bool injail;
#ifdef INET6
if (!(grp->il_vflag & INP_IPV4))
continue;
#endif
if (grp->il_lport != lport)
continue;
injail = prison_flag(grp->il_cred, PR_IP4) != 0;
/* Skip a jailed group whose prison check rejects 'laddr'. */
if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
laddr) != 0)
continue;
/*
 * In each category the first candidate is remembered, and a later
 * one replaces it only if it also matches the domain/FIB.
 */
if (grp->il_laddr.s_addr == laddr->s_addr) {
if (injail) {
jail_exact = grp;
if (in_pcblookup_lb_match(grp, domain, fib))
/* This is a perfect match. */
goto out;
} else if (local_exact == NULL ||
in_pcblookup_lb_match(grp, domain, fib)) {
local_exact = grp;
}
} else if (grp->il_laddr.s_addr == INADDR_ANY) {
if (injail) {
if (jail_wild == NULL ||
in_pcblookup_lb_match(grp, domain, fib))
jail_wild = grp;
} else if (local_wild == NULL ||
in_pcblookup_lb_match(grp, domain, fib)) {
local_wild = grp;
}
}
}
/* Choose the best category that produced a candidate. */
if (jail_exact != NULL)
grp = jail_exact;
else if (jail_wild != NULL)
grp = jail_wild;
else if (local_exact != NULL)
grp = local_exact;
else
grp = local_wild;
if (grp == NULL)
return (NULL);
out:
/*
 * Synchronize with in_pcblbgroup_insert().
 */
count = atomic_load_acq_int(&grp->il_inpcnt);
/* An empty group also guards the modulo below against division by zero. */
if (count == 0)
return (NULL);
inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
return (inp);
}
/*
 * Report whether 'inp' matches the given 4-tuple exactly.  When built with
 * INET6, inpcbs without INP_IPV4 never match.
 */
static bool
in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (false);
#endif
	return (inp->inp_faddr.s_addr == faddr.s_addr &&
	    inp->inp_laddr.s_addr == laddr.s_addr &&
	    inp->inp_fport == fport &&
	    inp->inp_lport == lport);
}
/*
 * Walk the exact-match hash chain for the given 4-tuple and return the first
 * inpcb that matches it, or NULL.  No inpcb lock or reference is acquired.
 */
static struct inpcb *
in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *chain;
	struct inpcb *inp;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	chain = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, chain, inp_hash_exact) {
		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
			break;
	}
	/* 'inp' is NULL when the chain was exhausted without a match. */
	return (inp);
}
/* Result of matching an unconnected inpcb against a local address/port. */
typedef enum {
INPLOOKUP_MATCH_NONE = 0, /* inpcb does not match */
INPLOOKUP_MATCH_WILD = 1, /* inpcb is bound to INADDR_ANY */
INPLOOKUP_MATCH_LADDR = 2, /* inpcb is bound to the requested address */
} inp_lookup_match_t;
/*
 * Classify how an unconnected inpcb matches the given local address, port
 * and FIB.  An inpcb with a foreign address set, a different local port, or
 * (unless 'fib' is RT_ALL_FIBS) a different FIB never matches.
 */
static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
    u_short lport, int fib)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (INPLOOKUP_MATCH_NONE);
#endif
	if (inp->inp_faddr.s_addr != INADDR_ANY)
		return (INPLOOKUP_MATCH_NONE);
	if (inp->inp_lport != lport)
		return (INPLOOKUP_MATCH_NONE);
	if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib)
		return (INPLOOKUP_MATCH_NONE);

	/* A wildcard binding is checked before an exact address binding. */
	if (inp->inp_laddr.s_addr == INADDR_ANY)
		return (INPLOOKUP_MATCH_WILD);
	return (inp->inp_laddr.s_addr == laddr.s_addr ?
	    INPLOOKUP_MATCH_LADDR : INPLOOKUP_MATCH_NONE);
}
/*
 * Sentinel returned by the SMR lookup helpers when the result is
 * inconclusive and the lookup has to be repeated under the hash lock.
 */
#define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1)
/*
 * Lockless (SMR) lookup of an unconnected inpcb matching laddr/lport/fib.
 *
 * Returns the inpcb locked per 'lockflags', NULL if the chain holds no
 * candidate (still inside the SMR section), or INP_LOOKUP_AGAIN if the first
 * candidate could not be locked or no longer qualified once locked.
 */
static struct inpcb *
in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
u_short lport, int fib, const inp_lookup_t lockflags)
{
struct inpcbhead *head;
struct inpcb *inp;
KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
("%s: not in SMR read section", __func__));
head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
pcbinfo->ipi_hashmask)];
CK_LIST_FOREACH(inp, head, inp_hash_wild) {
inp_lookup_match_t match;
match = in_pcblookup_wild_match(inp, laddr, lport, fib);
if (match == INPLOOKUP_MATCH_NONE)
continue;
/* inp_smr_lock() leaves the SMR section whether or not it succeeds. */
if (__predict_true(inp_smr_lock(inp, lockflags))) {
/* Re-check under the lock: the inpcb may have changed meanwhile. */
match = in_pcblookup_wild_match(inp, laddr, lport, fib);
if (match != INPLOOKUP_MATCH_NONE &&
prison_check_ip4_locked(inp->inp_cred->cr_prison,
&laddr) == 0)
return (inp);
inp_unlock(inp, lockflags);
}
/*
 * The matching socket disappeared out from under us. Fall back
 * to a serialized lookup.
 */
return (INP_LOOKUP_AGAIN);
}
return (NULL);
}
/*
 * Look up the best unconnected inpcb for laddr/lport/fib.  The caller must
 * satisfy INP_HASH_LOCK_ASSERT().  Returns the chosen inpcb without locking
 * or referencing it, or NULL if nothing matches.
 */
static struct inpcb *
in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
u_short lport, int fib)
{
struct inpcbhead *head;
struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
struct inpcb *local_wild_mapped;
#endif
INP_HASH_LOCK_ASSERT(pcbinfo);
/*
 * Order of socket selection - we always prefer jails.
 * 1. jailed, non-wild.
 * 2. jailed, wild.
 * 3. non-jailed, non-wild.
 * 4. non-jailed, wild.
 */
head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
pcbinfo->ipi_hashmask)];
local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
local_wild_mapped = NULL;
#endif
CK_LIST_FOREACH(inp, head, inp_hash_wild) {
inp_lookup_match_t match;
bool injail;
match = in_pcblookup_wild_match(inp, laddr, lport, fib);
if (match == INPLOOKUP_MATCH_NONE)
continue;
injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
if (injail) {
/* A jailed inpcb qualifies only if its prison check accepts 'laddr'. */
if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
&laddr) != 0)
continue;
} else {
/* A non-jailed exact match was already found; skip other non-jailed ones. */
if (local_exact != NULL)
continue;
}
if (match == INPLOOKUP_MATCH_LADDR) {
/* Jailed and bound to the exact address: best possible, stop here. */
if (injail)
return (inp);
local_exact = inp;
} else {
#ifdef INET6
/* XXX inp locking, NULL check */
if (inp->inp_vflag & INP_IPV6PROTO)
local_wild_mapped = inp;
else
#endif
/* With INET6, the 'if' below is the body of the 'else' above. */
if (injail)
jail_wild = inp;
else
local_wild = inp;
}
}
/* Return the best remaining candidate in order of preference. */
if (jail_wild != NULL)
return (jail_wild);
if (local_exact != NULL)
return (local_exact);
if (local_wild != NULL)
return (local_wild);
#ifdef INET6
if (local_wild_mapped != NULL)
return (local_wild_mapped);
#endif
return (NULL);
}
/*
* Lookup PCB in hash list, using pcbinfo tables. This variation assumes
* that the caller has either locked the hash list, which usually happens
* for bind(2) operations, or is in SMR section, which happens when sorting
* out incoming packets.
*/
/*
 * Look up an inpcb for the given 4-tuple under the hash write lock, which is
 * asserted below.
 *
 * An exact 4-tuple match is preferred.  If there is none and
 * INPLOOKUP_WILDCARD is set, a load-balancing group is consulted first and
 * the wildcard hash second.  The result is neither locked nor referenced.
 */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	const u_short fport = fport_arg, lport = lport_arg;
	struct inpcb *found;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(faddr.s_addr != INADDR_ANY,
	    ("%s: invalid foreign address", __func__));
	KASSERT(laddr.s_addr != INADDR_ANY,
	    ("%s: invalid local address", __func__));
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	found = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (found != NULL || (lookupflags & INPLOOKUP_WILDCARD) == 0)
		return (found);

	found = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, &laddr, lport,
	    numa_domain, fib);
	if (found != NULL)
		return (found);

	return (in_pcblookup_hash_wild_locked(pcbinfo, laddr, lport, fib));
}
/*
 * Serialized lookup: find an inpcb under the hash write lock and return it
 * locked according to the lock bits in 'lookupflags'.  Returns NULL if
 * nothing matches, or if the inpcb was released while waiting for its lock.
 */
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
uint8_t numa_domain, int fib)
{
struct inpcb *inp;
const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
("%s: LOCKPCB not set", __func__));
INP_HASH_WLOCK(pcbinfo);
inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);
if (inp != NULL && !inp_trylock(inp, lockflags)) {
/*
 * The inpcb lock is contended.  Pin the inpcb with a reference,
 * drop the hash lock, then block on the inpcb lock.
 */
in_pcbref(inp);
INP_HASH_WUNLOCK(pcbinfo);
inp_lock(inp, lockflags);
/* in_pcbrele() returning true means the inpcb was unlocked and freed. */
if (in_pcbrele(inp, lockflags))
/* XXX-MJ or retry until we get a negative match? */
inp = NULL;
} else {
/* Either no match, or the inpcb lock was taken without blocking. */
INP_HASH_WUNLOCK(pcbinfo);
}
return (inp);
}
/*
 * Lockless (SMR-protected) PCB lookup, falling back to the hash-locked
 * in_pcblookup_hash() whenever a candidate cannot be locked or no longer
 * matches after being locked.
 *
 * lookupflags must contain INPLOOKUP_RLOCKPCB or INPLOOKUP_WLOCKPCB; the
 * returned PCB is locked accordingly.  Returns NULL when nothing matches.
 *
 * NOTE(review): the SMR section entered here is explicitly exited only on
 * the final NULL return; on the other paths it is presumably left by
 * inp_smr_lock() / in_pcblookup_hash_wild_smr() -- confirm against their
 * definitions.
 */
static struct inpcb *
in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
uint8_t numa_domain, int fib)
{
struct inpcb *inp;
const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
const u_short fport = fport_arg, lport = lport_arg;
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
("%s: invalid lookup flags %d", __func__, lookupflags));
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
("%s: LOCKPCB not set", __func__));
smr_enter(pcbinfo->ipi_smr);
/* First preference: an exact 4-tuple match. */
inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
if (inp != NULL) {
if (__predict_true(inp_smr_lock(inp, lockflags))) {
/*
* Revalidate the 4-tuple, the socket could have been
* disconnected.
*/
if (__predict_true(in_pcblookup_exact_match(inp,
faddr, fport, laddr, lport)))
return (inp);
inp_unlock(inp, lockflags);
}
/*
* We failed to lock the inpcb, or its connection state changed
* out from under us. Fall back to a precise search.
*/
return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
lookupflags, numa_domain, fib));
}
/* No exact match: try load-balance groups, then the wildcard hash. */
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
&laddr, lport, numa_domain, fib);
if (inp != NULL) {
if (__predict_true(inp_smr_lock(inp, lockflags))) {
/* Re-check the match now that the PCB is locked. */
if (__predict_true(in_pcblookup_wild_match(inp,
laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
return (inp);
inp_unlock(inp, lockflags);
}
/* Lock failed or the PCB changed: force the locked retry below. */
inp = INP_LOOKUP_AGAIN;
} else {
inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport,
fib, lockflags);
}
/* INP_LOOKUP_AGAIN is a sentinel asking for the hash-locked lookup. */
if (inp == INP_LOOKUP_AGAIN) {
return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr,
lport, lookupflags, numa_domain, fib));
}
}
/* Nothing was found, so this function still owns the SMR section. */
if (inp == NULL)
smr_exit(pcbinfo->ipi_smr);
return (inp);
}
/*
 * Public inpcb lookup routines.  Both take a 4-tuple plus lookup flags and
 * forward to in_pcblookup_hash_smr(); they differ only in where the FIB
 * number and NUMA domain come from.
 *
 * in_pcblookup(): when INPLOOKUP_FIB is set the FIB is taken from the
 * interface 'ifp', otherwise all FIBs are searched.  No NUMA domain
 * preference is expressed (M_NODOM).
 */
struct inpcb *
in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
    struct in_addr laddr, u_int lport, int lookupflags,
    struct ifnet *ifp)
{
	int fib = RT_ALL_FIBS;

	if ((lookupflags & INPLOOKUP_FIB) != 0)
		fib = if_getfib(ifp);

	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags, M_NODOM, fib));
}
/*
 * Variant of in_pcblookup() that derives the FIB number (when
 * INPLOOKUP_FIB is set) and the NUMA domain from the packet header of 'm'
 * rather than from an interface.  'm' must carry a packet header.
 */
struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    struct ifnet *ifp __unused, struct mbuf *m)
{
	int fib = RT_ALL_FIBS;

	M_ASSERTPKTHDR(m);
	if ((lookupflags & INPLOOKUP_FIB) != 0)
		fib = M_GETFIB(m);

	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags, m->m_pkthdr.numa_domain, fib));
}
#endif /* INET */
/*
 * Report whether prison_flag() yields a non-zero result for the PCB's
 * credential and the given prison flag (callers here pass PR_IP4/PR_IP6).
 */
static bool
in_pcbjailed(const struct inpcb *inp, unsigned int flag)
{
	const bool jailed = prison_flag(inp->inp_cred, flag) != 0;

	return (jailed);
}
/*
 * Insert the PCB into a hash chain using ordering rules which ensure that
 * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first.
 *
 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
 * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs
 * always appear last no matter whether they are jailed.
 *
 * The caller must hold the inpcb lock and the pcbinfo hash write lock (both
 * asserted).  'last' acts as a cursor: when the function falls through to
 * the end, NULL means "insert at the head" and non-NULL means "insert
 * immediately before this element".
 */
static void
_in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
struct inpcb *last;
bool bound, injail;
INP_LOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
last = NULL;
bound = inp->inp_laddr.s_addr != INADDR_ANY;
/* Unbound PCB with INP_IPV6PROTO set: append at the tail (head if empty). */
if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
return;
}
}
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
return;
}
injail = in_pcbjailed(inp, PR_IP4);
if (!injail) {
/*
 * Non-jailed PCB: advance the cursor to the first non-jailed entry.
 * If every entry is jailed, append after the last one.
 */
CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
if (!in_pcbjailed(last, PR_IP4))
break;
if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
return;
}
}
} else if (!CK_LIST_EMPTY(pcbhash) &&
!in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
/* Jailed PCB and the chain starts with a non-jailed one: go first. */
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
return;
}
if (!bound) {
/*
 * Wildcard local address: continue from the cursor to the first entry
 * that is itself a wildcard, appending at the tail if there is none.
 * NOTE(review): with a NULL cursor CK_LIST_FOREACH_FROM presumably
 * starts at the list head -- confirm against ck_queue.h.
 */
CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
if (last->inp_laddr.s_addr == INADDR_ANY)
break;
if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
return;
}
}
}
/* Insert in front of the cursor, or at the head when there is none. */
if (last == NULL)
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
else
CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
#ifdef INET6
/*
 * See the comment above _in_pcbinshash_wild().
 *
 * IPv6 flavour of the ordered wildcard-hash insert: jail membership is
 * tested with PR_IP6 and "bound" means the IPv6 local address is not the
 * unspecified address.  There is no special v4-mapped tail case here.
 * 'last' is a cursor: NULL at the end means insert at the head, non-NULL
 * means insert immediately before that element.
 */
static void
_in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
struct inpcb *last;
bool bound, injail;
INP_LOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
last = NULL;
bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
injail = in_pcbjailed(inp, PR_IP6);
if (!injail) {
/* Skip past jailed entries; append at the tail if all are jailed. */
CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
if (!in_pcbjailed(last, PR_IP6))
break;
if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
return;
}
}
} else if (!CK_LIST_EMPTY(pcbhash) &&
!in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
/* Jailed PCB and the chain starts with a non-jailed one: go first. */
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
return;
}
if (!bound) {
/*
 * Unspecified local address: continue from the cursor to the first
 * entry whose local address is also unspecified, appending at the
 * tail when there is none.
 */
CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
break;
if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
return;
}
}
}
/* Insert in front of the cursor, or at the head when there is none. */
if (last == NULL)
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
else
CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
#endif
/*
 * Insert PCB onto various hash lists.
 *
 * NOTE(review): this listing is a rendered diff.  Lines prefixed with "-"
 * are code being removed, lines prefixed with "+" are code being added, and
 * unprefixed lines are unchanged context.
 *
 * The caller must hold the inpcb write lock and the pcbinfo hash write lock
 * (both asserted), and the PCB must not already be hashed (INP_INHASHLIST
 * clear).  The PCB is linked onto the exact-match hash when its foreign
 * address is set and onto the wildcard hash otherwise, and onto the
 * local-port hash.  When the socket has SO_REUSEPORT_LB set it is first
 * added to a load-balance group.
 *
 * Returns 0 on success or the error from in_pcbinslbgrouphash().  The
 * removed ("-") code could additionally return ENOMEM.
 */
int
in_pcbinshash(struct inpcb *inp)
{
- struct inpcbhead *pcbhash;
- struct inpcbporthead *pcbporthash;
+ struct inpcbhead *pcbhash, *pcbporthash;
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
- struct inpcbport *phd;
uint32_t hash;
bool connected;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
("in_pcbinshash: INP_INHASHLIST"));
/* Hash on (faddr, lport, fport); "connected" means faddr is specified. */
#ifdef INET6
if (inp->inp_vflag & INP_IPV6) {
hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
inp->inp_fport, pcbinfo->ipi_hashmask);
connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
} else
#endif
{
hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
inp->inp_fport, pcbinfo->ipi_hashmask);
connected = !in_nullhost(inp->inp_faddr);
}
if (connected)
pcbhash = &pcbinfo->ipi_hash_exact[hash];
else
pcbhash = &pcbinfo->ipi_hash_wild[hash];
pcbporthash = &pcbinfo->ipi_porthashbase[
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
/*
* Add entry to load balance group.
* Only do this if SO_REUSEPORT_LB is set.
*/
if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {
int error = in_pcbinslbgrouphash(inp, M_NODOM);
if (error != 0)
return (error);
}
- /*
- * Go through port list and look for a head for this lport.
- */
- CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
- if (phd->phd_port == inp->inp_lport)
- break;
- }
-
- /*
- * If none exists, malloc one and tack it on.
- */
- if (phd == NULL) {
- phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
- if (phd == NULL) {
- if ((inp->inp_flags & INP_INLBGROUP) != 0)
- in_pcbremlbgrouphash(inp);
- return (ENOMEM);
- }
- phd->phd_port = inp->inp_lport;
- CK_LIST_INIT(&phd->phd_pcblist);
- CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
- }
- inp->inp_phd = phd;
- CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
-
/*
* The PCB may have been disconnected in the past. Before we can safely
* make it visible in the hash table, we must wait for all readers which
* may be traversing this PCB to finish.
*/
if (inp->inp_smr != SMR_SEQ_INVALID) {
smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
inp->inp_smr = SMR_SEQ_INVALID;
}
/* Connected PCBs go to the bucket head; wildcard ones are rank-ordered. */
if (connected)
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
else {
#ifdef INET6
if ((inp->inp_vflag & INP_IPV6) != 0)
_in6_pcbinshash_wild(pcbhash, inp);
else
#endif
_in_pcbinshash_wild(pcbhash, inp);
}
+ CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist);
inp->inp_flags |= INP_INHASHLIST;
return (0);
}
/*
 * Unlink a hashed PCB from its load-balance group (if any), from the
 * exact-match or wildcard hash (chosen by whether the foreign address is
 * specified), and from the local-port list, then clear INP_INHASHLIST.
 *
 * The caller must hold the inpcb write lock and the pcbinfo hash write lock
 * (both asserted), and the PCB must currently be hashed.
 *
 * NOTE(review): this listing is a rendered diff.  Lines prefixed with "-"
 * are code being removed; unprefixed lines are unchanged context.
 */
void
in_pcbremhash_locked(struct inpcb *inp)
{
- struct inpcbport *phd = inp->inp_phd;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
MPASS(inp->inp_flags & INP_INHASHLIST);
if ((inp->inp_flags & INP_INLBGROUP) != 0)
in_pcbremlbgrouphash(inp);
/* The foreign address tells us which of the two hashes holds the PCB. */
#ifdef INET6
if (inp->inp_vflag & INP_IPV6) {
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
CK_LIST_REMOVE(inp, inp_hash_wild);
else
CK_LIST_REMOVE(inp, inp_hash_exact);
} else
#endif
{
if (in_nullhost(inp->inp_faddr))
CK_LIST_REMOVE(inp, inp_hash_wild);
else
CK_LIST_REMOVE(inp, inp_hash_exact);
}
CK_LIST_REMOVE(inp, inp_portlist);
- if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
- CK_LIST_REMOVE(phd, phd_hash);
- uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
- }
inp->inp_flags &= ~INP_INHASHLIST;
}
/*
 * Convenience wrapper: take the pcbinfo hash write lock, remove the PCB
 * from the hash lists, and drop the lock again.
 */
static void
in_pcbremhash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;

	INP_HASH_WLOCK(pcbinfo);
	in_pcbremhash_locked(inp);
	INP_HASH_WUNLOCK(pcbinfo);
}
/*
 * Re-home a hashed PCB after its { faddr, fport } changed.  The local port
 * is not handled here (the port hash would need updating too), so lport
 * must stay fixed once in_pcbinshash() has run.
 *
 * The caller holds the inpcb write lock and the pcbinfo hash write lock and
 * must guarantee that exactly one of the old and new foreign addresses was
 * unspecified: a now-connected PCB is moved from the wildcard hash to the
 * exact hash, and a now-unconnected one the other way round.
 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *bucket;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT(inp->inp_flags & INP_INHASHLIST,
	    ("%s: !INP_INHASHLIST", __func__));
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp was disconnected", __func__));

#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	if (connected) {
		/* Was wildcard, is now connected. */
		CK_LIST_REMOVE(inp, inp_hash_wild);
		bucket = &pcbinfo->ipi_hash_exact[hash];
		CK_LIST_INSERT_HEAD(bucket, inp, inp_hash_exact);
	} else {
		/* Was connected, is now wildcard. */
		CK_LIST_REMOVE(inp, inp_hash_exact);
		bucket = &pcbinfo->ipi_hash_wild[hash];
		CK_LIST_INSERT_HEAD(bucket, inp, inp_hash_wild);
	}
}
/*
 * Called when a higher layer reports service problems on this PCB.  The
 * only action taken for now is to invalidate the cached route, so that a
 * dynamically created route (e.g. from a redirect) is dropped and the
 * default gateway gets another chance.
 */
void
in_losing(struct inpcb *inp)
{

	RO_INVALIDATE_CACHE(&inp->inp_route);
}
/*
 * Propagate a socket-layer MAC label change into the socket's in_pcb.
 * Compiles to an empty function unless the kernel is built with MAC.
 * The inpcb write lock is taken first, then the socket lock.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *const inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
/* Function wrapper around the INP_WLOCK() macro: write-lock the inpcb. */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}
/* Function wrapper around the INP_WUNLOCK() macro: drop the write lock. */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}
/* Function wrapper around the INP_RLOCK() macro: read-lock the inpcb. */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}
/* Function wrapper around the INP_RUNLOCK() macro: drop the read lock. */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
#ifdef INVARIANT_SUPPORT
/*
 * Function wrapper around INP_WLOCK_ASSERT().  Despite the generic name,
 * this asserts that the inpcb is write-locked specifically.
 */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}
/* Function wrapper around INP_UNLOCK_ASSERT(): the inpcb must be unlocked. */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
#endif
/*
 * Invoke func(inp, arg) on every PCB yielded by an all-PCB iterator over
 * 'pcbinfo'.  The iterator is created with INPLOOKUP_WLOCKPCB.
 * NOTE(review): each PCB is presumably write-locked while 'func' runs --
 * confirm against inp_next().
 */
void
inp_apply_all(struct inpcbinfo *pcbinfo,
    void (*func)(struct inpcb *, void *), void *arg)
{
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;

	for (inp = inp_next(&inpi); inp != NULL; inp = inp_next(&inpi))
		func(inp, arg);
}
/*
 * Return the socket attached to the inpcb.  The caller must hold the inpcb
 * write lock (asserted).
 */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{
	struct socket *so;

	INP_WLOCK_ASSERT(inp);
	so = inp->inp_socket;
	return (so);
}
/*
 * Copy the PCB's IPv4 4-tuple into the caller's variables.  Values are
 * copied verbatim from the PCB with no byte-order conversion.  The caller
 * must hold the inpcb lock (asserted); all four output pointers are
 * written unconditionally.
 */
void
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
    uint32_t *faddr, uint16_t *fp)
{

	INP_LOCK_ASSERT(inp);

	/* Local endpoint. */
	*laddr = inp->inp_laddr.s_addr;
	*lp = inp->inp_lport;

	/* Foreign endpoint. */
	*faddr = inp->inp_faddr.s_addr;
	*fp = inp->inp_fport;
}
/*
* Create an external-format (``xinpcb'') structure using the information in
* the kernel-format in_pcb structure pointed to by inp. This is done to
* reduce the spew of irrelevant information over this interface, to isolate
* user code from changes in the kernel structure, and potentially to provide
* information-hiding if we decide that some of this information should be
* hidden from users.
*/
void
in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
{
bzero(xi, sizeof(*xi));
xi->xi_len = sizeof(struct xinpcb);
if (inp->inp_socket)
sotoxsocket(inp->inp_socket, &xi->xi_socket);
bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
xi->inp_gencnt = inp->inp_gencnt;
xi->inp_flow = inp->inp_flow;
xi->inp_flowid = inp->inp_flowid;
xi->inp_flowtype = inp->inp_flowtype;
xi->inp_flags = inp->inp_flags;
xi->inp_flags2 = inp->inp_flags2;
xi->in6p_cksum = inp->in6p_cksum;
xi->in6p_hops = inp->in6p_hops;
xi->inp_ip_tos = inp->inp_ip_tos;
xi->inp_vflag = inp->inp_vflag;
xi->inp_ip_ttl = inp->inp_ip_ttl;
xi->inp_ip_p = inp->inp_ip_p;
xi->inp_ip_minttl = inp->inp_ip_minttl;
}
/*
 * Write-only sysctl handler that applies a socket option to the PCB whose
 * generation count matches the request.
 *
 * The new value must be a struct sockopt_parameters followed by the option
 * value, at most 1024 bytes in total.  Reading the node is rejected.
 *
 * When both ports in the request are non-zero, the iterator is narrowed to
 * the matching hash bucket; otherwise every PCB in 'pcbinfo' is visited.
 * SOL_SOCKET options are applied with sosetopt() after the inpcb lock has
 * been dropped; all other levels are handed to 'ctloutput_set' with the
 * inpcb still write-locked.
 * NOTE(review): 'ctloutput_set' is presumably responsible for unlocking
 * the inpcb -- confirm against its implementations.
 *
 * Returns 0 or an errno value: EINVAL (read attempted or request too
 * short), EPERM (no new value), ENOMEM (request too large), ECONNRESET
 * (PCB already dropped), ESRCH (no PCB with that id), or whatever the
 * copy-in or option handler returned.
 */
int
sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
    int (*ctloutput_set)(struct inpcb *, struct sockopt *))
{
	struct sockopt sopt;
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;
	struct sockopt_parameters *params;
	struct socket *so;
	int error;
	char buf[1024];

	if (req->oldptr != NULL || req->oldlen != 0)
		return (EINVAL);
	if (req->newptr == NULL)
		return (EPERM);
	if (req->newlen > sizeof(buf))
		return (ENOMEM);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		return (error);
	if (req->newlen < sizeof(struct sockopt_parameters))
		return (EINVAL);
	params = (struct sockopt_parameters *)buf;
	sopt.sopt_level = params->sop_level;
	sopt.sopt_name = params->sop_optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = params->sop_optval;
	sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
	sopt.sopt_td = NULL;
#ifdef INET6
	/* Embed the zone id into link-local addresses before hashing. */
	if (params->sop_inc.inc_flags & INC_ISIPV6) {
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
			params->sop_inc.inc6_laddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
			params->sop_inc.inc6_faddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
	}
#endif
	/* With both ports known, restrict the walk to one hash bucket. */
	if (params->sop_inc.inc_lport != htons(0) &&
	    params->sop_inc.inc_fport != htons(0)) {
#ifdef INET6
		if (params->sop_inc.inc_flags & INC_ISIPV6)
			inpi.hash = INP6_PCBHASH(
			    &params->sop_inc.inc6_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
		else
#endif
			inpi.hash = INP_PCBHASH(
			    &params->sop_inc.inc_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
	}
	while ((inp = inp_next(&inpi)) != NULL) {
		if (inp->inp_gencnt != params->sop_id)
			continue;
		if (inp->inp_flags & INP_DROPPED) {
			INP_WUNLOCK(inp);
			return (ECONNRESET);
		}
		so = inp->inp_socket;
		KASSERT(so != NULL, ("inp_socket == NULL"));
		/* Hold the socket across the option call. */
		soref(so);
		if (params->sop_level == SOL_SOCKET) {
			INP_WUNLOCK(inp);
			error = sosetopt(so, &sopt);
		} else
			error = (*ctloutput_set)(inp, &sopt);
		sorele(so);
		break;
	}
	if (inp == NULL)
		error = ESRCH;
	return (error);
}
#ifdef DDB
/*
 * Emit 'indent' spaces on the debugger console.  A zero or negative count
 * prints nothing.
 */
static void
db_print_indent(int indent)
{
	int remaining;

	for (remaining = indent; remaining > 0; remaining--)
		db_printf(" ");
}
/*
 * Print an in_conninfo on the debugger console: a heading line giving
 * 'name' and the structure's address, followed by the local and foreign
 * address/port pairs indented two columns further.  Ports are converted
 * to host byte order for display.
 */
static void
db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
{
	char faddr_str[48], laddr_str[48];
	const int inner = indent + 2;

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inc);

	/* Render both addresses in the family the conninfo declares. */
#ifdef INET6
	if ((inc->inc_flags & INC_ISIPV6) != 0) {
		ip6_sprintf(laddr_str, &inc->inc6_laddr);
		ip6_sprintf(faddr_str, &inc->inc6_faddr);
	} else
#endif
	{
		inet_ntoa_r(inc->inc_laddr, laddr_str);
		inet_ntoa_r(inc->inc_faddr, faddr_str);
	}

	db_print_indent(inner);
	db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
	    ntohs(inc->inc_lport));
	db_print_indent(inner);
	db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
	    ntohs(inc->inc_fport));
}
static void
db_print_inpflags(int inp_flags)
{
int comma;
comma = 0;
if (inp_flags & INP_RECVOPTS) {
db_printf("%sINP_RECVOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVRETOPTS) {
db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVDSTADDR) {
db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_ORIGDSTADDR) {
db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_HDRINCL) {
db_printf("%sINP_HDRINCL", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_HIGHPORT) {
db_printf("%sINP_HIGHPORT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_LOWPORT) {
db_printf("%sINP_LOWPORT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_ANONPORT) {
db_printf("%sINP_ANONPORT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVIF) {
db_printf("%sINP_RECVIF", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_MTUDISC) {
db_printf("%sINP_MTUDISC", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVTTL) {
db_printf("%sINP_RECVTTL", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_DONTFRAG) {
db_printf("%sINP_DONTFRAG", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVTOS) {
db_printf("%sINP_RECVTOS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_IPV6_V6ONLY) {
db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_PKTINFO) {
db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_HOPLIMIT) {
db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_HOPOPTS) {
db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_DSTOPTS) {
db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_RTHDR) {
db_printf("%sIN6P_RTHDR", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_RTHDRDSTOPTS) {
db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_TCLASS) {
db_printf("%sIN6P_TCLASS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_AUTOFLOWLABEL) {
db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_ONESBCAST) {
db_printf("%sINP_ONESBCAST", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_DROPPED) {
db_printf("%sINP_DROPPED", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_SOCKREF) {
db_printf("%sINP_SOCKREF", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_RFC2292) {
db_printf("%sIN6P_RFC2292", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_MTU) {
db_printf("IN6P_MTU%s", comma ? ", " : "");
comma = 1;
}
}
/*
 * Print the symbolic names of the address-family bits set in 'inp_vflag'
 * (INP_IPV4, INP_IPV6, INP_IPV6PROTO, in that order) separated by ", ".
 */
static void
db_print_inpvflag(u_char inp_vflag)
{
	const char *sep = "";

	if ((inp_vflag & INP_IPV4) != 0) {
		db_printf("%sINP_IPV4", sep);
		sep = ", ";
	}
	if ((inp_vflag & INP_IPV6) != 0) {
		db_printf("%sINP_IPV6", sep);
		sep = ", ";
	}
	if ((inp_vflag & INP_IPV6PROTO) != 0) {
		db_printf("%sINP_IPV6PROTO", sep);
		sep = ", ";
	}
}
/*
 * Dump an inpcb on the debugger console: a heading with 'name' and the
 * PCB's address, then its flow label, connection info, flags (numeric and
 * symbolic), IP header parameters, the address-family specific option
 * pointers, and the generation count, all indented two columns past
 * 'indent'.
 *
 * NOTE(review): this listing is a rendered diff.  Lines prefixed with "-"
 * are code being removed and lines prefixed with "+" are code being added.
 */
static void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{
db_print_indent(indent);
db_printf("%s at %p\n", name, inp);
indent += 2;
db_print_indent(indent);
db_printf("inp_flow: 0x%x\n", inp->inp_flow);
db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
db_print_indent(indent);
db_printf("inp_label: %p inp_flags: 0x%x (",
inp->inp_label, inp->inp_flags);
db_print_inpflags(inp->inp_flags);
db_printf(")\n");
db_print_indent(indent);
db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp,
inp->inp_vflag);
db_print_inpvflag(inp->inp_vflag);
db_printf(")\n");
db_print_indent(indent);
db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
db_print_indent(indent);
/* Only the first line printed in either branch below gets this indent. */
#ifdef INET6
if (inp->inp_vflag & INP_IPV6) {
db_printf("in6p_options: %p in6p_outputopts: %p "
"in6p_moptions: %p\n", inp->in6p_options,
inp->in6p_outputopts, inp->in6p_moptions);
db_printf("in6p_icmp6filt: %p in6p_cksum %d "
"in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
inp->in6p_hops);
} else
#endif
{
db_printf("inp_ip_tos: %d inp_ip_options: %p "
"inp_ip_moptions: %p\n", inp->inp_ip_tos,
inp->inp_options, inp->inp_moptions);
}
db_print_indent(indent);
- db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd,
- (uintmax_t)inp->inp_gencnt);
+ db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt);
}
/*
 * DDB "show inpcb <addr>" command: interpret the supplied address as a
 * struct inpcb and dump it.  Prints a usage line when no address is given.
 */
DB_SHOW_COMMAND(inpcb, db_show_inpcb)
{

	if (!have_addr) {
		db_printf("usage: show inpcb <addr>\n");
		return;
	}
	db_print_inpcb((struct inpcb *)addr, "inpcb", 0);
}
#endif /* DDB */
#ifdef RATELIMIT
/*
 * Modify the TX rate limit of the send tag already attached to the PCB
 * ("inp->inp_snd_tag").
 *
 * Returns EINVAL when the PCB has no send tag, EOPNOTSUPP when the tag's
 * driver provides no modify method, and otherwise the driver's result.
 */
int
in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
{
	union if_snd_tag_modify_params params = {
		.rate_limit.max_rate = max_pacing_rate,
		.rate_limit.flags = M_NOWAIT,
	};
	struct m_snd_tag *mst;
	int error;

	mst = inp->inp_snd_tag;
	if (mst == NULL)
		return (EINVAL);

	if (mst->sw->snd_tag_modify == NULL) {
		error = EOPNOTSUPP;
	} else {
		error = mst->sw->snd_tag_modify(mst, &params);
	}
	return (error);
}
/*
 * Query the TX rate limit of the send tag already attached to the PCB
 * ("inp->inp_snd_tag").
 *
 * On success the maximum rate is stored through 'p_max_pacing_rate' when
 * that pointer is non-NULL.  Returns EINVAL when the PCB has no send tag,
 * EOPNOTSUPP when the tag's driver provides no query method, and otherwise
 * the driver's result.
 */
int
in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
{
	union if_snd_tag_query_params params = { };
	struct m_snd_tag *mst;
	int error;

	mst = inp->inp_snd_tag;
	if (mst == NULL)
		return (EINVAL);

	if (mst->sw->snd_tag_query == NULL) {
		error = EOPNOTSUPP;
	} else {
		error = mst->sw->snd_tag_query(mst, &params);
		if (error == 0 && p_max_pacing_rate != NULL)
			*p_max_pacing_rate = params.rate_limit.max_rate;
	}
	return (error);
}
/*
 * Query the TX queue level of the send tag already attached to the PCB
 * ("inp->inp_snd_tag").
 *
 * On success the queue level is stored through 'p_txqueue_level' when that
 * pointer is non-NULL.  Returns EINVAL when the PCB has no send tag,
 * EOPNOTSUPP when the tag's driver provides no query method, and otherwise
 * the driver's result.
 */
int
in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
{
	union if_snd_tag_query_params params = { };
	struct m_snd_tag *mst;
	int error;

	mst = inp->inp_snd_tag;
	if (mst == NULL)
		return (EINVAL);

	if (mst->sw->snd_tag_query == NULL)
		return (EOPNOTSUPP);

	error = mst->sw->snd_tag_query(mst, &params);
	if (error == 0 && p_txqueue_level != NULL)
		*p_txqueue_level = params.rate_limit.queue_level;
	return (error);
}
/*
 * Allocate a new TX rate limit send tag from the network interface 'ifp'
 * and store it through 'st' (callers in this file pass
 * &inp->inp_snd_tag).
 *
 * A max_pacing_rate of -1U requests an unlimited tag; any other value
 * requests a rate-limited one.  The caller must hold the inpcb write lock.
 *
 * Returns EINVAL when '*st' is already set or the PCB has been dropped,
 * otherwise the result of m_snd_tag_alloc().  With INET, the rate limit
 * counters are updated to reflect the outcome; EOPNOTSUPP is not counted
 * as an allocation failure.
 */
int
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate,
    struct m_snd_tag **st)
{
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
		.rate_limit.max_rate = max_pacing_rate,
		.rate_limit.flags = M_NOWAIT,
	};
	int error;

	INP_WLOCK_ASSERT(inp);

	/*
	 * If there is already a send tag, or the INP is being torn
	 * down, allocating a new send tag is not allowed. Else send
	 * tags may leak.
	 */
	if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
		return (EINVAL);

	error = m_snd_tag_alloc(ifp, &params, st);
#ifdef INET
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}
/*
 * Release one reference on the send tag 'mst' and, with INET, decrement
 * the count of active rate limits.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{

	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
/*
 * Detach and release the PCB's TX rate limit send tag
 * ("inp->inp_snd_tag"), if it has one.  The pointer in the PCB is cleared
 * before the tag is released.  The caller must hold the inpcb write lock.
 */
void
in_pcbdetach_txrtlmt(struct inpcb *inp)
{
	struct m_snd_tag *tag;

	INP_WLOCK_ASSERT(inp);

	tag = inp->inp_snd_tag;
	inp->inp_snd_tag = NULL;
	if (tag != NULL) {
		m_snd_tag_rele(tag);
#ifdef INET
		counter_u64_add(rate_limit_active, -1);
#endif
	}
}
/*
 * Bring the PCB's send tag in line with 'max_pacing_rate' for traffic
 * leaving through 'ifp': attach a tag, modify the existing one, or detach
 * it, as appropriate.
 *
 * INP_RATE_LIMIT_CHANGED is cleared when the outcome is 0 or EOPNOTSUPP,
 * and left (or set) otherwise so that a later packet retries.  Returns 0
 * or an errno value; EAGAIN means the mbuf carried no hash type yet.
 */
int
in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp,
    struct mbuf *mb, uint32_t max_pacing_rate)
{
	int error;

	/*
	 * A route change may have left us with a tag that belongs to a
	 * different interface.  Drop it, and raise the CHANGED flag so
	 * allocation is retried on later packets should it fail now.
	 */
	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
		in_pcbdetach_txrtlmt(inp);
		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
	}

	/*
	 * NOTE: Attaching to a network interface takes a reference on it,
	 * so the interface cannot go away while rate-limited connections
	 * remain.  The interface pointers compared in this function are
	 * therefore valid, except when compared against NULL.
	 */
	if (inp->inp_snd_tag == NULL && max_pacing_rate == 0) {
		/* No limit wanted and none in place. */
		error = 0;
	} else if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
		/* Interface cannot rate limit: make sure no tag lingers. */
		if (inp->inp_snd_tag != NULL)
			in_pcbdetach_txrtlmt(inp);
		error = 0;
	} else if (inp->inp_snd_tag != NULL) {
		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
	} else if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
		/*
		 * Packet pacing with RSS needs a valid RSS hash; wait
		 * until one is available.
		 */
		error = EAGAIN;
	} else {
		error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
		    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
	}

	if (error == 0 || error == EOPNOTSUPP)
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
	return (error);
}
/*
* This function should be called when the INP_RATE_LIMIT_CHANGED flag
* is set in the fast path and will attach/detach/modify the TX rate
* limit send tag based on the socket's so_max_pacing_rate value.
*/
void
in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
{
struct socket *socket;
uint32_t max_pacing_rate;
bool did_upgrade;
if (inp == NULL)
return;
socket = inp->inp_socket;
if (socket == NULL)
return;
if (!INP_WLOCKED(inp)) {
/*
* NOTE: If the write locking fails, we need to bail
* out and use the non-ratelimited ring for the
* transmit until there is a new chance to get the
* write lock.
*/
if (!INP_TRY_UPGRADE(inp))
return;
did_upgrade = 1;
} else {
did_upgrade = 0;
}
/*
* NOTE: The so_max_pacing_rate value is read unlocked,
* because atomic updates are not required since the variable
* is checked at every mbuf we send. It is assumed that the
* variable read itself will be atomic.
*/
max_pacing_rate = socket->so_max_pacing_rate;
in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
if (did_upgrade)
INP_DOWNGRADE(inp);
}
/*
* Track route changes for TX rate limiting.
*/
void
in_pcboutput_eagain(struct inpcb *inp)
{
bool did_upgrade;
if (inp == NULL)
return;
if (inp->inp_snd_tag == NULL)
return;
if (!INP_WLOCKED(inp)) {
/*
* NOTE: If the write locking fails, we need to bail
* out and use the non-ratelimited ring for the
* transmit until there is a new chance to get the
* write lock.
*/
if (!INP_TRY_UPGRADE(inp))
return;
did_upgrade = 1;
} else {
did_upgrade = 0;
}
/* detach rate limiting */
in_pcbdetach_txrtlmt(inp);
/* make sure new mbuf send tag allocation is made */
inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
if (did_upgrade)
INP_DOWNGRADE(inp);
}
#ifdef INET
/*
 * Allocate the rate limit statistics counters.  Registered below to run
 * from SYSINIT at SI_SUB_PROTO_DOMAININIT; the argument is unused.
 */
static void
rl_init(void *st)
{

	rate_limit_new = counter_u64_alloc(M_WAITOK);
	rate_limit_chg = counter_u64_alloc(M_WAITOK);
	rate_limit_active = counter_u64_alloc(M_WAITOK);
	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
}

SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
#endif
#endif /* RATELIMIT */
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index c2b90de2ef54..5fe12c4f1e76 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -1,715 +1,710 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California.
* Copyright (c) 2010-2011 Juniper Networks, Inc.
* All rights reserved.
*
* Portions of this software were developed by Robert N. M. Watson under
* contract to Juniper Networks, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _NETINET_IN_PCB_H_
#define _NETINET_IN_PCB_H_
#include <sys/queue.h>
#include <sys/epoch.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
#include <sys/_rwlock.h>
#include <sys/_smr.h>
#include <net/route.h>
#ifdef _KERNEL
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <net/vnet.h>
#include <vm/uma.h>
#endif
#include <sys/ck.h>
/*
* struct inpcb is the common protocol control block structure used in most
* IP transport protocols.
*
* Pointers to local and foreign host table entries, local and foreign socket
* numbers, and pointers up (to a socket structure) and down (to a
* protocol-specific control block) are stored here.
*/
CK_LIST_HEAD(inpcbhead, inpcb);
-CK_LIST_HEAD(inpcbporthead, inpcbport);
CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup);
typedef uint64_t inp_gen_t;
/*
 * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet.
 * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing
 * the following structure. This requires padding always be zeroed out,
 * which is done right after inpcb allocation and stays through its lifetime.
 *
 * Layout: three 32-bit pad words followed by the IPv4 address, so the
 * structure is the same size as an IPv6 address and the IPv4 address
 * overlays its final 32 bits when used through union in_dependaddr.
 */
struct in_addr_4in6 {
u_int32_t ia46_pad32[3];
struct in_addr ia46_addr4;
};
/*
 * Protocol-dependent address storage: the same bytes viewed either as an
 * IPv4 address padded out to IPv6 size (id46_addr) or as an IPv6 address
 * (id6_addr).
 */
union in_dependaddr {
struct in_addr_4in6 id46_addr;
struct in6_addr id6_addr;
};
/*
 * Connection endpoints: foreign and local port plus foreign and local
 * address, each address stored as a union in_dependaddr.
 *
 * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has
 * some extra padding to accomplish this.
 * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport,
 * lport, faddr to generate hash, so these fields shouldn't be moved.
 */
struct in_endpoints {
u_int16_t ie_fport; /* foreign port */
u_int16_t ie_lport; /* local port */
/* protocol dependent part, local and foreign addr */
union in_dependaddr ie_dependfaddr; /* foreign host table entry */
union in_dependaddr ie_dependladdr; /* local host table entry */
/* Shorthand accessors for the IPv4 and IPv6 views of the two addresses. */
#define ie_faddr ie_dependfaddr.id46_addr.ia46_addr4
#define ie_laddr ie_dependladdr.id46_addr.ia46_addr4
#define ie6_faddr ie_dependfaddr.id6_addr
#define ie6_laddr ie_dependladdr.id6_addr
u_int32_t ie6_zoneid; /* scope zone id */
};
/*
 * Connection information: flags, a length field, the FIB number and the
 * endpoints.  Fields of the embedded in_endpoints are normally reached
 * through the inc_* shorthand macros defined after this structure.
 *
 * XXX The defines for inc_* are hacks and should be changed to direct
 * references.
 */
struct in_conninfo {
u_int8_t inc_flags; /* INC_* flags */
u_int8_t inc_len;
u_int16_t inc_fibnum; /* XXX was pad, 16 bits is plenty */
/* protocol dependent part */
struct in_endpoints inc_ie;
};
/*
* Flags for inc_flags.
*/
#define INC_ISIPV6 0x01
#define INC_IPV6MINMTU 0x02
#define inc_fport inc_ie.ie_fport
#define inc_lport inc_ie.ie_lport
#define inc_faddr inc_ie.ie_faddr
#define inc_laddr inc_ie.ie_laddr
#define inc6_faddr inc_ie.ie6_faddr
#define inc6_laddr inc_ie.ie6_laddr
#define inc6_zoneid inc_ie.ie6_zoneid
#if defined(_KERNEL) || defined(_WANT_INPCB)
/*
* struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
* IPv6 sockets. In the case of TCP and UDP, further per-connection state is
* located in a larger protocol specific structure that embeds inpcb in it.
* Almost all fields of struct inpcb are static after creation or protected by
* a per-inpcb rwlock, inp_lock.
*
* An inpcb database is indexed by addresses/ports hash as well as by a list of
* all pcbs that belong to a certain proto. Database lookups or list traversals
* are performed inside an SMR section. Once the desired PCB is found, its own
* lock is to be obtained and the SMR section exited.
*
* Key:
* (c) - Constant after initialization
* (e) - Protected by the SMR section
* (i) - Protected by the inpcb lock
* (p) - Protected by the pcbinfo lock for the inpcb
* (h) - Protected by the pcbhash lock for the inpcb
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
*
* A few other notes:
*
* When a read lock is held, stability of the field is guaranteed; to write
* to a field, a write lock must generally be held.
*
* netinet/netinet6-layer code should not assume that the inp_socket pointer
* is safe to dereference without inp_lock being held, there may be
* close(2)-related races.
*
* The inp_vflag field is overloaded, and would otherwise ideally be (c).
*/
/* Forward declarations for types that struct inpcb only holds pointers to. */
struct icmp6_filter;
struct inpcbpolicy;
struct m_snd_tag;
struct inpcb {
/* Cache line #1 (amd64) */
union {
CK_LIST_ENTRY(inpcb) inp_hash_exact; /* hash table linkage */
LIST_ENTRY(inpcb) inp_lbgroup_list; /* lb group list */
};
CK_LIST_ENTRY(inpcb) inp_hash_wild; /* hash table linkage */
struct rwlock inp_lock; /* per-inpcb rwlock, see the INP_*LOCK macros */
/* Cache line #2 (amd64) */
/*
* inp_zero_size is the number of bytes from inp_refcount through the end
* of the structure (presumably the region zeroed at allocation -- confirm).
*/
#define inp_start_zero inp_refcount
#define inp_zero_size (sizeof(struct inpcb) - \
offsetof(struct inpcb, inp_start_zero))
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
uint8_t inp_numa_domain; /* numa domain */
struct socket *inp_socket; /* (i) back pointer to socket */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
struct ucred *inp_cred; /* (c) cache of socket cred */
u_int32_t inp_flow; /* (i) IPv6 flow information */
u_char inp_vflag; /* (i) IP version flag (v4/v6) */
u_char inp_ip_ttl; /* (i) time to live proto */
u_char inp_ip_p; /* (c) protocol proto */
u_char inp_ip_minttl; /* (i) minimum TTL or drop */
uint32_t inp_flowid; /* (x) flow id / queue id */
smr_seq_t inp_smr; /* (i) sequence number at disconnect */
struct m_snd_tag *inp_snd_tag; /* (i) send tag for outgoing mbufs */
uint32_t inp_flowtype; /* (x) M_HASHTYPE value */
/* Local and foreign ports, local and foreign addr. */
struct in_conninfo inp_inc; /* (i,h) connection endpoints (ports and addresses) */
/* MAC and IPSEC policy information. */
struct label *inp_label; /* (i) MAC label */
struct inpcbpolicy *inp_sp; /* (s) for IPSEC */
/* Protocol-dependent part; options. */
/* IPv4-side options (inp_ prefix). */
struct {
u_char inp_ip_tos; /* (i) type of service proto */
struct mbuf *inp_options; /* (i) IP options */
struct ip_moptions *inp_moptions; /* (i) mcast options */
};
/* IPv6-side options (in6p_ prefix). */
struct {
/* (i) IP options */
struct mbuf *in6p_options;
/* (i) IP6 options for outgoing packets */
struct ip6_pktopts *in6p_outputopts;
/* (i) IP multicast options */
struct ip6_moptions *in6p_moptions;
/* (i) ICMPv6 code type filter */
struct icmp6_filter *in6p_icmp6filt;
/* (i) IPV6_CHECKSUM setsockopt */
int in6p_cksum;
short in6p_hops;
};
CK_LIST_ENTRY(inpcb) inp_portlist; /* (r:e/w:h) port list */
- struct inpcbport *inp_phd; /* (r:e/w:h) head of this list */
inp_gen_t inp_gencnt; /* (c) generation count */
void *spare_ptr; /* Spare pointer. */
rt_gen_t inp_rt_cookie; /* generation for route entry */
union { /* cached L3 information */
struct route inp_route;
struct route_in6 inp_route6;
};
CK_LIST_ENTRY(inpcb) inp_list; /* (r:e/w:p) all PCBs for proto */
};
#endif /* _KERNEL || _WANT_INPCB */
/* Shorthand for inpcb fields reached through inp_inc and inp_pcbinfo. */
#define inp_fport inp_inc.inc_fport
#define inp_lport inp_inc.inc_lport
#define inp_faddr inp_inc.inc_faddr
#define inp_laddr inp_inc.inc_laddr
#define in6p_faddr inp_inc.inc6_faddr
#define in6p_laddr inp_inc.inc6_laddr
#define in6p_zoneid inp_inc.inc6_zoneid
#define inp_vnet inp_pcbinfo->ipi_vnet
/*
* The range of the generation count, as used in this implementation, is about
* 1.8e19 (2^64). We would have to create nearly 600 billion connections per
* second for this number to roll over in a year. This seems sufficiently
* unlikely that we simply don't concern ourselves with that possibility.
*/
/*
* Interface exported to userland by various protocols which use inpcbs. Hack
* alert -- only define if struct xsocket is in scope.
* Fields prefixed with "xi_" are unique to this structure, and the rest
* match fields in the struct inpcb, to ease coding and porting.
*
* Legend:
* (s) - used by userland utilities in src
* (p) - used by utilities in ports
* (3) - is known to be used by third party software not in ports
* (n) - no known usage
*/
#ifdef _SYS_SOCKETVAR_H_
/* Layout is exported to userland; do not reorder or resize fields. */
struct xinpcb {
ksize_t xi_len; /* length of this structure */
struct xsocket xi_socket; /* (s,p) */
struct in_conninfo inp_inc; /* (s,p) */
uint64_t inp_gencnt; /* (s,p) */
int64_t inp_spare64[5]; /* spare 64-bit slots */
uint32_t inp_flow; /* (s) */
uint32_t inp_flowid; /* (s) */
uint32_t inp_flowtype; /* (s) */
int32_t inp_flags; /* (s,p) */
int32_t inp_flags2; /* (s) */
uint32_t inp_unused;
int32_t in6p_cksum; /* (n) */
int32_t inp_spare32[4]; /* spare 32-bit slots */
uint16_t in6p_hops; /* (n) */
uint8_t inp_ip_tos; /* (n) */
int8_t pad8;
uint8_t inp_vflag; /* (s,p) */
uint8_t inp_ip_ttl; /* (n) */
uint8_t inp_ip_p; /* (n) */
uint8_t inp_ip_minttl; /* (n) */
int8_t inp_spare8[4]; /* spare 8-bit slots */
} __aligned(8);
/*
* Count and generation snapshot.
* NOTE(review): presumably exported next to the xinpcb records -- confirm
* against the sysctl handlers.
*/
struct xinpgen {
ksize_t xig_len; /* length of this structure */
u_int xig_count; /* number of PCBs at this time */
uint32_t _xig_spare32;
inp_gen_t xig_gen; /* generation count at this time */
so_gen_t xig_sogen; /* socket generation count this time */
uint64_t _xig_spare64[4];
} __aligned(8);
/*
* Socket option request carrying a connection identifier and a
* variable-length option value.
* NOTE(review): presumably the input format of sysctl_setsockopt() -- confirm.
*/
struct sockopt_parameters {
struct in_conninfo sop_inc;
uint64_t sop_id;
int sop_level;
int sop_optname;
char sop_optval[]; /* flexible array member: option value bytes */
};
#ifdef _KERNEL
/* Kernel-only helpers that depend on struct xinpcb being in scope. */
int sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
int (*ctloutput_set)(struct inpcb *, struct sockopt *));
void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
#endif
#endif /* _SYS_SOCKETVAR_H_ */
#ifdef _KERNEL
/*
* Per-VNET pcb database for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6.
*
* The pcbs are protected with SMR section and thus all lists in inpcbinfo
* are CK-lists. Locking is required to insert a pcb into database. Two
* locks are provided: one for the hash and one for the global list of pcbs,
* as well as overall count and generation count.
*
* Locking key:
*
* (c) Constant or nearly constant after initialisation
* (e) Protected by SMR section
* (g) Locked by ipi_lock
* (h) Locked by ipi_hash_lock
*/
struct inpcbinfo {
/*
* Global lock protecting inpcb list modification
*/
struct mtx ipi_lock;
struct inpcbhead ipi_listhead; /* (r:e/w:g) */
u_int ipi_count; /* (g) */
/*
* Generation count -- incremented each time a connection is allocated
* or freed.
*/
u_quad_t ipi_gencnt; /* (g) */
/*
* Fields associated with port lookup and allocation.
*/
u_short ipi_lastport; /* (h) */
u_short ipi_lastlow; /* (h) */
u_short ipi_lasthi; /* (h) */
/*
* UMA zone from which inpcbs are allocated for this protocol.
*/
uma_zone_t ipi_zone; /* (c) */
/*
* NOTE(review): this change removes ips_portzone from struct inpcbstorage;
* check whether ipi_portzone is still used or can be removed as well.
*/
uma_zone_t ipi_portzone; /* (c) */
smr_t ipi_smr; /* (c) */
/*
* Global hash of inpcbs, hashed by local and foreign addresses and
* port numbers. The "exact" hash holds PCBs connected to a foreign
* address, and "wild" holds the rest.
*/
struct mtx ipi_hash_lock;
struct inpcbhead *ipi_hash_exact; /* (r:e/w:h) */
struct inpcbhead *ipi_hash_wild; /* (r:e/w:h) */
u_long ipi_hashmask; /* (c) */
/*
* Global hash of inpcbs, hashed by only local port number.
*/
- struct inpcbporthead *ipi_porthashbase; /* (h) */
+ struct inpcbhead *ipi_porthashbase; /* (h) */
u_long ipi_porthashmask; /* (h) */
/*
* Load balance groups used for the SO_REUSEPORT_LB option,
* hashed by local port.
*/
struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (r:e/w:h) */
u_long ipi_lbgrouphashmask; /* (h) */
/*
* Pointer to network stack instance
*/
struct vnet *ipi_vnet; /* (c) */
};
/*
* Global allocation storage for each high-level protocol (UDP, TCP, ...).
* Each corresponding per-VNET inpcbinfo points into this one.
*/
struct inpcbstorage {
uma_zone_t ips_zone; /* UMA zone for the protocol's pcbs */
- uma_zone_t ips_portzone;
uma_init ips_pcbinit; /* per-item init callback, see INPCBSTORAGE_DEFINE */
size_t ips_size; /* size of the protocol's pcb structure */
const char * ips_zone_name;
- const char * ips_portzone_name;
const char * ips_infolock_name;
const char * ips_hashlock_name;
};
/*
* Define the static inpcbstorage object "prot" for a protocol whose pcb type
* is "struct ppcb". The macro emits:
* - prot_inpcb_init(), which initializes inp_lock under the name lname;
* - the storage object, with zname as the zone name and iname/hname as the
* info-lock and hash-lock names;
* - SYSINIT/SYSUNINIT hooks that pass the object to in_pcbstorage_init()
* and in_pcbstorage_destroy().
*/
#define INPCBSTORAGE_DEFINE(prot, ppcb, lname, zname, iname, hname) \
static int \
prot##_inpcb_init(void *mem, int size __unused, int flags __unused) \
{ \
struct inpcb *inp = mem; \
\
rw_init_flags(&inp->inp_lock, lname, RW_RECURSE | RW_DUPOK); \
return (0); \
} \
static struct inpcbstorage prot = { \
.ips_size = sizeof(struct ppcb), \
.ips_pcbinit = prot##_inpcb_init, \
.ips_zone_name = zname, \
- .ips_portzone_name = zname " ports", \
.ips_infolock_name = iname, \
.ips_hashlock_name = hname, \
}; \
SYSINIT(prot##_inpcbstorage_init, SI_SUB_PROTO_DOMAIN, \
SI_ORDER_SECOND, in_pcbstorage_init, &prot); \
SYSUNINIT(prot##_inpcbstorage_uninit, SI_SUB_PROTO_DOMAIN, \
SI_ORDER_SECOND, in_pcbstorage_destroy, &prot)
/* Wrappers that apply the rwlock primitives to an inpcb's inp_lock. */
#define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock)
#define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock)
#define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock)
#define INP_TRY_RLOCK(inp) rw_try_rlock(&(inp)->inp_lock)
#define INP_TRY_WLOCK(inp) rw_try_wlock(&(inp)->inp_lock)
#define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock)
#define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock)
#define INP_UNLOCK(inp) rw_unlock(&(inp)->inp_lock)
#define INP_TRY_UPGRADE(inp) rw_try_upgrade(&(inp)->inp_lock)
#define INP_DOWNGRADE(inp) rw_downgrade(&(inp)->inp_lock)
#define INP_WLOCKED(inp) rw_wowned(&(inp)->inp_lock)
/* Assertions on the state of inp_lock. */
#define INP_LOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_LOCKED)
#define INP_RLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_RLOCKED)
#define INP_WLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_WLOCKED)
#define INP_UNLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_UNLOCKED)
/*
* These locking functions are for inpcb consumers outside of sys/netinet,
* more specifically, they were added for the benefit of TOE drivers. The
* macros are reserved for use by the stack.
*/
void inp_wlock(struct inpcb *);
void inp_wunlock(struct inpcb *);
void inp_rlock(struct inpcb *);
void inp_runlock(struct inpcb *);
/* The assert functions exist only with INVARIANT_SUPPORT; otherwise they are no-ops. */
#ifdef INVARIANT_SUPPORT
void inp_lock_assert(struct inpcb *);
void inp_unlock_assert(struct inpcb *);
#else
#define inp_lock_assert(inp) do {} while (0)
#define inp_unlock_assert(inp) do {} while (0)
#endif
/* Function-based accessors for inpcb state. */
void inp_apply_all(struct inpcbinfo *, void (*func)(struct inpcb *, void *),
void *arg);
struct socket *
inp_inpcbtosocket(struct inpcb *inp);
void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
uint32_t *faddr, uint16_t *fp);
#endif /* _KERNEL */
/*
* Wrappers for the two inpcbinfo mutexes: ipi_lock (INP_INFO_*) and
* ipi_hash_lock (INP_HASH_*). The *_LOCK_ASSERT forms are satisfied either by
* being inside the pcbinfo's SMR section or by owning the mutex; the
* *_WLOCK_ASSERT forms require owning the mutex.
*/
#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
#define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock)
#define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock)
#define INP_INFO_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \
mtx_owned(&(ipi)->ipi_lock))
#define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
#define INP_INFO_WUNLOCK_ASSERT(ipi) \
mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED)
#define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock)
#define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock)
#define INP_HASH_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \
mtx_owned(&(ipi)->ipi_hash_lock))
#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, \
MA_OWNED)
/*
* Wildcard matching hash is not just a microoptimisation! The hash for
* wildcard IPv4 and wildcard IPv6 must be the same, otherwise AF_INET6
* wildcard bound pcb won't be able to receive AF_INET connections, while:
* jenkins_hash(&zeroes, 1, s) != jenkins_hash(&zeroes, 4, s)
* See also comment above struct in_addr_4in6.
*/
/* Hash of an IPv4 address; the wildcard address hashes to the bare seed. */
#define IN_ADDR_JHASH32(addr) \
((addr)->s_addr == INADDR_ANY ? V_in_pcbhashseed : \
jenkins_hash32((&(addr)->s_addr), 1, V_in_pcbhashseed))
/* Hash of an IPv6 address; the unspecified address hashes to the bare seed. */
#define IN6_ADDR_JHASH32(addr) \
(memcmp((addr), &in6addr_any, sizeof(in6addr_any)) == 0 ? \
V_in_pcbhashseed : \
jenkins_hash32((addr)->__u6_addr.__u6_addr32, \
nitems((addr)->__u6_addr.__u6_addr32), V_in_pcbhashseed))
/* Bucket index from foreign address and both ports, reduced by mask. */
#define INP_PCBHASH(faddr, lport, fport, mask) \
((IN_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport))) & (mask))
#define INP6_PCBHASH(faddr, lport, fport, mask) \
((IN6_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport))) & (mask))
/* Bucket index that depends on the local port and the seed only. */
#define INP_PCBHASH_WILD(lport, mask) \
((V_in_pcbhashseed ^ ntohs(lport)) & (mask))
/* Same mix as INP_PCBHASH/INP6_PCBHASH but without the mask applied. */
#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
(IN_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport)))
#define INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
(IN6_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport)))
/* Port-hash bucket index: the host-order local port reduced by mask. */
#define INP_PCBPORTHASH(lport, mask) (ntohs((lport)) & (mask))
/*
* Flags for inp_vflag -- historically version flags only
*/
#define INP_IPV4 0x1
#define INP_IPV6 0x2
#define INP_IPV6PROTO 0x4 /* opened under IPv6 protocol */
/*
* Flags for inp_flags.
*/
#define INP_RECVOPTS 0x00000001 /* receive incoming IP options */
#define INP_RECVRETOPTS 0x00000002 /* receive IP options for reply */
#define INP_RECVDSTADDR 0x00000004 /* receive IP dst address */
#define INP_HDRINCL 0x00000008 /* user supplies entire IP header */
#define INP_HIGHPORT 0x00000010 /* user wants "high" port binding */
#define INP_LOWPORT 0x00000020 /* user wants "low" port binding */
#define INP_ANONPORT 0x00000040 /* read by netstat(1) */
#define INP_RECVIF 0x00000080 /* receive incoming interface */
#define INP_MTUDISC 0x00000100 /* user can do MTU discovery */
/* INP_FREED 0x00000200 private to in_pcb.c */
#define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */
#define INP_DONTFRAG 0x00000800 /* don't fragment packet */
#define INP_BINDANY 0x00001000 /* allow bind to any address */
#define INP_INHASHLIST 0x00002000 /* in_pcbinshash() has been called */
#define INP_RECVTOS 0x00004000 /* receive incoming IP TOS */
#define IN6P_IPV6_V6ONLY 0x00008000 /* restrict AF_INET6 socket for v6 */
#define IN6P_PKTINFO 0x00010000 /* receive IP6 dst and I/F */
#define IN6P_HOPLIMIT 0x00020000 /* receive hoplimit */
#define IN6P_HOPOPTS 0x00040000 /* receive hop-by-hop options */
#define IN6P_DSTOPTS 0x00080000 /* receive dst options after rthdr */
#define IN6P_RTHDR 0x00100000 /* receive routing header */
#define IN6P_RTHDRDSTOPTS 0x00200000 /* receive dstoptions before rthdr */
#define IN6P_TCLASS 0x00400000 /* receive traffic class value */
#define IN6P_AUTOFLOWLABEL 0x00800000 /* attach flowlabel automatically */
/* INP_INLBGROUP 0x01000000 private to in_pcb.c */
#define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */
#define INP_DROPPED 0x04000000 /* protocol drop flag */
#define INP_SOCKREF 0x08000000 /* strong socket reference */
#define INP_RESERVED_0 0x10000000 /* reserved field */
#define INP_BOUNDFIB 0x20000000 /* Bound to a specific FIB. */
#define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */
/*
* NOTE(review): 0x80000000 occupies the top bit of the 32-bit flag word while
* inp_flags is declared as int -- confirm callers handle the signedness.
*/
#define IN6P_MTU 0x80000000 /* receive path MTU */
/* Combined mask of the option bits listed in the expansion below. */
#define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\
INP_RECVIF|INP_RECVTTL|INP_RECVTOS|\
IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\
IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\
IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\
IN6P_MTU)
/*
* Flags for inp_flags2.
*/
/* Commented-out values below mark bits that currently have no flag assigned. */
/* 0x00000001 */
/* 0x00000002 */
/* 0x00000004 */
/* 0x00000008 */
/* 0x00000010 */
/* 0x00000020 */
/* 0x00000040 */
/* 0x00000080 */
#define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */
#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
/* 0x00001000 */
/* 0x00002000 */
/* 0x00004000 */
/* 0x00008000 */
/* 0x00010000 */
#define INP_2PCP_SET 0x00020000 /* If the Eth PCP should be set explicitly */
#define INP_2PCP_BIT0 0x00040000 /* Eth PCP Bit 0 */
#define INP_2PCP_BIT1 0x00080000 /* Eth PCP Bit 1 */
#define INP_2PCP_BIT2 0x00100000 /* Eth PCP Bit 2 */
/* The three PCP bits occupy bits 18..20, matching INP_2PCP_SHIFT. */
#define INP_2PCP_BASE INP_2PCP_BIT0
#define INP_2PCP_MASK (INP_2PCP_BIT0 | INP_2PCP_BIT1 | INP_2PCP_BIT2)
#define INP_2PCP_SHIFT 18 /* shift PCP field in/out of inp_flags2 */
/*
* Flags passed to in_pcblookup*(), inp_smr_lock() and inp_next().
*/
typedef enum {
INPLOOKUP_WILDCARD = 0x00000001, /* Allow wildcard sockets. */
INPLOOKUP_RLOCKPCB = 0x00000002, /* Return inpcb read-locked. */
INPLOOKUP_WLOCKPCB = 0x00000004, /* Return inpcb write-locked. */
INPLOOKUP_FIB = 0x00000008, /* inp must be from same FIB. */
} inp_lookup_t;
/* All defined lookup flags. */
#define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
INPLOOKUP_WLOCKPCB | INPLOOKUP_FIB)
/* Only the two flags that select how the returned inpcb is locked. */
#define INPLOOKUP_LOCKMASK (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)
/*
* sotoinpcb() returns the inpcb stored in a socket's so_pcb pointer.
* INP_SOCKAF() yields the address family of the socket's protocol domain and
* INP_CHECK_SOCKAF() compares that family against af. Arguments and
* expansions are fully parenthesized so that expressions such as "so + 1" or
* "a | b" can be passed safely.
*/
#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb)
#define INP_SOCKAF(so) ((so)->so_proto->pr_domain->dom_family)
#define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == (af))
#ifdef _KERNEL
/* Per-VNET port-range variables, followed by their V_ accessor macros. */
VNET_DECLARE(int, ipport_reservedhigh);
VNET_DECLARE(int, ipport_reservedlow);
VNET_DECLARE(int, ipport_lowfirstauto);
VNET_DECLARE(int, ipport_lowlastauto);
VNET_DECLARE(int, ipport_firstauto);
VNET_DECLARE(int, ipport_lastauto);
VNET_DECLARE(int, ipport_hifirstauto);
VNET_DECLARE(int, ipport_hilastauto);
VNET_DECLARE(int, ipport_randomized);
#define V_ipport_reservedhigh VNET(ipport_reservedhigh)
#define V_ipport_reservedlow VNET(ipport_reservedlow)
#define V_ipport_lowfirstauto VNET(ipport_lowfirstauto)
#define V_ipport_lowlastauto VNET(ipport_lowlastauto)
#define V_ipport_firstauto VNET(ipport_firstauto)
#define V_ipport_lastauto VNET(ipport_lastauto)
#define V_ipport_hifirstauto VNET(ipport_hifirstauto)
#define V_ipport_hilastauto VNET(ipport_hilastauto)
#define V_ipport_randomized VNET(ipport_randomized)
/* Prototypes for the IPv4 inpcb KPI; the implementations are not in this header. */
void in_pcbinfo_init(struct inpcbinfo *, struct inpcbstorage *,
u_int, u_int);
void in_pcbinfo_destroy(struct inpcbinfo *);
/* These two take void * because they are registered through SYSINIT/SYSUNINIT. */
void in_pcbstorage_init(void *);
void in_pcbstorage_destroy(void *);
void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
int in_pcballoc(struct socket *, struct inpcbinfo *);
#define INPBIND_FIB 0x0001 /* bind to the PCB's FIB only */
int in_pcbbind(struct inpcb *, struct sockaddr_in *, int, struct ucred *);
int in_pcbbind_setup(struct inpcb *, struct sockaddr_in *, in_addr_t *,
u_short *, int, struct ucred *);
int in_pcbconnect(struct inpcb *, struct sockaddr_in *, struct ucred *);
void in_pcbdisconnect(struct inpcb *);
void in_pcbdrop(struct inpcb *);
void in_pcbfree(struct inpcb *);
int in_pcbladdr(const struct inpcb *, struct in_addr *, struct in_addr *,
struct ucred *);
int in_pcblbgroup_numa(struct inpcb *, int arg);
void in_pcblisten(struct inpcb *);
struct inpcb *
in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
struct in_addr, u_int, int, struct ifnet *);
struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
struct in_addr, u_int, int, struct ifnet *, struct mbuf *);
/* Reference counting. */
void in_pcbref(struct inpcb *);
bool in_pcbrele(struct inpcb *, inp_lookup_t);
bool in_pcbrele_rlocked(struct inpcb *);
bool in_pcbrele_wlocked(struct inpcb *);
/* Predicate used by iterators to select inpcbs; receives the iterator's ctx. */
typedef bool inp_match_t(const struct inpcb *, void *);
/* State for inp_next(); build it with INP_ITERATOR() or INP_ALL_ITERATOR(). */
struct inpcb_iterator {
const struct inpcbinfo *ipi;
struct inpcb *inp;
inp_match_t *match;
void *ctx;
int hash;
#define INP_ALL_LIST -1
const inp_lookup_t lock;
};
/* Note: sparse initializers guarantee .inp = NULL. */
/* Iterator over INP_ALL_LIST with a match callback and its context. */
#define INP_ITERATOR(_ipi, _lock, _match, _ctx) \
{ \
.ipi = (_ipi), \
.lock = (_lock), \
.hash = INP_ALL_LIST, \
.match = (_match), \
.ctx = (_ctx), \
}
/* Iterator over INP_ALL_LIST with match and ctx left zero-initialized. */
#define INP_ALL_ITERATOR(_ipi, _lock) \
{ \
.ipi = (_ipi), \
.lock = (_lock), \
.hash = INP_ALL_LIST, \
}
/* Advances an iterator initialized with one of the macros above. */
struct inpcb *inp_next(struct inpcb_iterator *);
void in_losing(struct inpcb *);
void in_pcbsetsolabel(struct socket *so);
int in_getpeeraddr(struct socket *, struct sockaddr *sa);
int in_getsockaddr(struct socket *, struct sockaddr *sa);
void in_pcbsosetlabel(struct socket *so);
/* Send-tag / transmit rate limiting hooks, built only with RATELIMIT. */
#ifdef RATELIMIT
int
in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *,
struct mbuf *, uint32_t);
int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t,
uint32_t, struct m_snd_tag **);
void in_pcbdetach_txrtlmt(struct inpcb *);
void in_pcbdetach_tag(struct m_snd_tag *);
int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
int in_pcbquery_txrlevel(struct inpcb *, uint32_t *);
void in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
void in_pcboutput_eagain(struct inpcb *);
#endif
#endif /* _KERNEL */
#endif /* !_NETINET_IN_PCB_H_ */
diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h
index fb88dfec889e..7e8a1626ab40 100644
--- a/sys/netinet/in_pcb_var.h
+++ b/sys/netinet/in_pcb_var.h
@@ -1,92 +1,86 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California.
* Copyright (c) 2010-2011 Juniper Networks, Inc.
* All rights reserved.
*
* Portions of this software were developed by Robert N. M. Watson under
* contract to Juniper Networks, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _NETINET_IN_PCB_VAR_H_
#define _NETINET_IN_PCB_VAR_H_
/*
* Definitions shared between netinet/in_pcb.c and netinet6/in6_pcb.c
*/
/* Per-VNET seed consumed by the IN_ADDR_JHASH32/IN6_ADDR_JHASH32 hash macros. */
VNET_DECLARE(uint32_t, in_pcbhashseed);
#define V_in_pcbhashseed VNET(in_pcbhashseed)
/* Locking helpers that take the lock type as an inp_lookup_t flag. */
void inp_lock(struct inpcb *inp, const inp_lookup_t lock);
void inp_unlock(struct inpcb *inp, const inp_lookup_t lock);
int inp_trylock(struct inpcb *inp, const inp_lookup_t lock);
bool inp_smr_lock(struct inpcb *, const inp_lookup_t);
/* Local port selection and lookup. */
int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *,
struct ucred *, int);
int in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa,
u_short *lportp, struct sockaddr *fsa, u_short fport,
struct ucred *cred, int lookupflags);
struct inpcb *in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short,
int, int, struct ucred *);
/* Hash table maintenance. */
int in_pcbinshash(struct inpcb *);
void in_pcbrehash(struct inpcb *);
void in_pcbremhash_locked(struct inpcb *);
-struct inpcbport {
- struct inpcbhead phd_pcblist;
- CK_LIST_ENTRY(inpcbport) phd_hash;
- u_short phd_port;
-};
-
/*
* Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
* (or unique address:port combination) can be re-used at most
* INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which
* is dynamically resized as processes bind/unbind to that specific group.
*/
struct inpcblbgroup {
CK_LIST_ENTRY(inpcblbgroup) il_list; /* linkage on an inpcblbgrouphead list */
LIST_HEAD(, inpcb) il_pending; /* PCBs waiting for listen() */
struct epoch_context il_epoch_ctx;
struct ucred *il_cred;
uint16_t il_lport; /* (c) */
u_char il_vflag; /* (c) */
uint8_t il_numa_domain;
int il_fibnum;
union in_dependaddr il_dependladdr; /* (c) */
/* Shorthand for the IPv4 and IPv6 views of the group's local address. */
#define il_laddr il_dependladdr.id46_addr.ia46_addr4
#define il6_laddr il_dependladdr.id6_addr
uint32_t il_inpsiz; /* max count in il_inp[] (h) */
uint32_t il_inpcnt; /* cur count in il_inp[] (h) */
uint32_t il_pendcnt; /* cur count in il_pending (h) */
struct inpcb *il_inp[]; /* (h) */
};
#endif /* !_NETINET_IN_PCB_VAR_H_ */
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 64c886ca2ed5..e77a1e9d3e87 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -1,1312 +1,1301 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* Copyright (c) 2010-2011 Juniper Networks, Inc.
* All rights reserved.
*
* Portions of this software were developed by Robert N. M. Watson under
* contract to Juniper Networks, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: in6_pcb.c,v 1.31 2001/05/21 05:45:10 jinmei Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_route.h"
#include "opt_rss.h"
#include <sys/hash.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/smr.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/jail.h>
#include <vm/uma.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_llatbl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip6.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/in_pcb.h>
#include <netinet/in_pcb_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_fib.h>
#include <netinet6/scope6_var.h>
SYSCTL_DECL(_net_inet6);
SYSCTL_DECL(_net_inet6_ip6);
/*
* Per-VNET integer knob, initialized to 1 and exposed read-write as the
* connect_in6addr_wild leaf under _net_inet6_ip6.
*/
VNET_DEFINE_STATIC(int, connect_in6addr_wild) = 1;
#define V_connect_in6addr_wild VNET(connect_in6addr_wild)
SYSCTL_INT(_net_inet6_ip6, OID_AUTO, connect_in6addr_wild,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_in6addr_wild), 0,
"Allow connecting to the unspecified address for connect(2)");
/*
 * Pick an anonymous local port for an IPv6 inpcb and enter the inpcb into
 * the hash.
 *
 * The caller must hold the inpcb write lock and the pcbinfo hash lock.
 * Returns 0 on success, the error reported by prison_local_ip6() or
 * in_pcb_lport(), or EAGAIN when in_pcbinshash() fails; in that last case
 * the local address and port are reset.
 */
int
in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred)
{
#ifdef INVARIANTS
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#endif
	struct socket *sock = inp->inp_socket;
	u_int16_t port = 0;
	int flags = 0;
	int rc;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	rc = prison_local_ip6(cred, laddr,
	    (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0);
	if (rc != 0)
		return (rc);

	/* XXX: this is redundant when called from in6_pcbbind */
	if (!(sock->so_options &
	    (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)))
		flags = INPLOOKUP_WILDCARD;

	inp->inp_flags |= INP_ANONPORT;

	rc = in_pcb_lport(inp, NULL, &port, cred, flags);
	if (rc != 0)
		return (rc);

	inp->inp_lport = port;
	if (in_pcbinshash(inp) == 0)
		return (0);

	/* Hash insertion failed: undo the binding. */
	inp->in6p_laddr = in6addr_any;
	inp->inp_lport = 0;
	return (EAGAIN);
}
/*
* Determine whether the inpcb can be bound to the specified address/port tuple.
*/
static int
in6_pcbbind_avail(struct inpcb *inp, const struct sockaddr_in6 *sin6, int fib,
int sooptions, int lookupflags, struct ucred *cred)
{
const struct in6_addr *laddr;
int reuseport, reuseport_lb;
u_short lport;
INP_LOCK_ASSERT(inp);
INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
laddr = &sin6->sin6_addr;
lport = sin6->sin6_port;
reuseport = (sooptions & SO_REUSEPORT);
reuseport_lb = (sooptions & SO_REUSEPORT_LB);
if (IN6_IS_ADDR_MULTICAST(laddr)) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
* allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0)
reuseport = SO_REUSEADDR | SO_REUSEPORT;
/*
* XXX: How to deal with SO_REUSEPORT_LB here?
* Treat same as SO_REUSEPORT for now.
*/
if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0)
reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB;
} else if (!IN6_IS_ADDR_UNSPECIFIED(laddr)) {
struct sockaddr_in6 sin6;
struct epoch_tracker et;
struct ifaddr *ifa;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(sin6);
sin6.sin6_addr = *laddr;
NET_EPOCH_ENTER(et);
if ((ifa = ifa_ifwithaddr((const struct sockaddr *)&sin6)) ==
NULL && (inp->inp_flags & INP_BINDANY) == 0) {
NET_EPOCH_EXIT(et);
return (EADDRNOTAVAIL);
}
/*
* XXX: bind to an anycast address might accidentally
* cause sending a packet with anycast source address.
* We should allow to bind to a deprecated address, since
* the application dares to use it.
*/
if (ifa != NULL &&
((struct in6_ifaddr *)ifa)->ia6_flags &
(IN6_IFF_ANYCAST | IN6_IFF_NOTREADY | IN6_IFF_DETACHED)) {
NET_EPOCH_EXIT(et);
return (EADDRNOTAVAIL);
}
NET_EPOCH_EXIT(et);
}
if (lport != 0) {
struct inpcb *t;
if (ntohs(lport) <= V_ipport_reservedhigh &&
ntohs(lport) >= V_ipport_reservedlow &&
priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
return (EACCES);
if (!IN6_IS_ADDR_MULTICAST(laddr) &&
priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) !=
0) {
/*
* If a socket owned by a different user is already
* bound to this port, fail. In particular, SO_REUSE*
* can only be used to share a port among sockets owned
* by the same user.
*
* However, we can share a port with a connected socket
* which has a unique 4-tuple.
*/
t = in6_pcblookup_local(inp->inp_pcbinfo, laddr, lport,
RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred);
if (t != NULL &&
(inp->inp_socket->so_type != SOCK_STREAM ||
IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) &&
(inp->inp_cred->cr_uid != t->inp_cred->cr_uid))
return (EADDRINUSE);
#ifdef INET
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
IN6_IS_ADDR_UNSPECIFIED(laddr)) {
struct sockaddr_in sin;
in6_sin6_2_sin(&sin, sin6);
t = in_pcblookup_local(inp->inp_pcbinfo,
sin.sin_addr, lport, RT_ALL_FIBS,
INPLOOKUP_WILDCARD, cred);
if (t != NULL &&
(inp->inp_socket->so_type != SOCK_STREAM ||
in_nullhost(t->inp_faddr)) &&
(inp->inp_cred->cr_uid !=
t->inp_cred->cr_uid))
return (EADDRINUSE);
}
#endif
}
t = in6_pcblookup_local(inp->inp_pcbinfo, laddr, lport,
fib, lookupflags, cred);
if (t != NULL && ((reuseport | reuseport_lb) &
t->inp_socket->so_options) == 0)
return (EADDRINUSE);
#ifdef INET
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
IN6_IS_ADDR_UNSPECIFIED(laddr)) {
struct sockaddr_in sin;
in6_sin6_2_sin(&sin, sin6);
t = in_pcblookup_local(inp->inp_pcbinfo, sin.sin_addr,
lport, RT_ALL_FIBS, lookupflags, cred);
if (t != NULL && ((reuseport | reuseport_lb) &
t->inp_socket->so_options) == 0 &&
(!in_nullhost(t->inp_laddr) ||
(t->inp_vflag & INP_IPV6PROTO) != 0)) {
return (EADDRINUSE);
}
}
#endif
}
return (0);
}
int
in6_pcbbind(struct inpcb *inp, struct sockaddr_in6 *sin6, int flags,
struct ucred *cred)
{
struct socket *so = inp->inp_socket;
u_short lport = 0;
int error, fib, lookupflags, sooptions;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
return (EINVAL);
lookupflags = 0;
sooptions = atomic_load_int(&so->so_options);
if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0)
lookupflags = INPLOOKUP_WILDCARD;
if (sin6 == NULL) {
if ((error = prison_local_ip6(cred, &inp->in6p_laddr,
((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
return (error);
} else {
KASSERT(sin6->sin6_family == AF_INET6,
("%s: invalid address family for %p", __func__, sin6));
KASSERT(sin6->sin6_len == sizeof(*sin6),
("%s: invalid address length for %p", __func__, sin6));
if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
return(error);
if ((error = prison_local_ip6(cred, &sin6->sin6_addr,
((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
return (error);
fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum :
RT_ALL_FIBS;
/* See if this address/port combo is available. */
error = in6_pcbbind_avail(inp, sin6, fib, sooptions, lookupflags,
cred);
if (error != 0)
return (error);
lport = sin6->sin6_port;
inp->in6p_laddr = sin6->sin6_addr;
}
if ((flags & INPBIND_FIB) != 0)
inp->inp_flags |= INP_BOUNDFIB;
if (lport == 0) {
if ((error = in6_pcbsetport(&inp->in6p_laddr, inp, cred)) != 0) {
/* Undo an address bind that may have occurred. */
inp->inp_flags &= ~INP_BOUNDFIB;
inp->in6p_laddr = in6addr_any;
return (error);
}
} else {
inp->inp_lport = lport;
if (in_pcbinshash(inp) != 0) {
inp->inp_flags &= ~INP_BOUNDFIB;
inp->in6p_laddr = in6addr_any;
inp->inp_lport = 0;
return (EAGAIN);
}
}
return (0);
}
/*
 * Inner subroutine of in6_pcbconnect(): validate the remote address in
 * 'sin6' and determine which local address to use to reach it.
 *
 * The destination must carry a non-zero port.  An unspecified destination is
 * rewritten to the loopback address when V_connect_in6addr_wild is set and
 * the interface address list is not empty; otherwise it yields ENETUNREACH.
 * When 'sas_required' is false the inpcb's current local address is used
 * instead of running source address selection.
 *
 * On success the chosen address is stored in *plocal_addr6 and 0 is
 * returned; on failure *plocal_addr6 is left untouched.
 */
static int
in6_pcbladdr(struct inpcb *inp, struct sockaddr_in6 *sin6,
    struct in6_addr *plocal_addr6, bool sas_required)
{
	struct in6_addr src;
	int ambiguous, error;

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);	/* XXXRW: why? */

	if (sin6->sin6_port == 0)
		return (EADDRNOTAVAIL);

	/* Must be sampled before sa6_embedscope() processes sin6. */
	ambiguous = (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) ? 1 : 0;

	error = sa6_embedscope(sin6, V_ip6_use_defzone);
	if (error != 0)
		return (error);

	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
		if (!V_connect_in6addr_wild ||
		    CK_STAILQ_EMPTY(&V_in6_ifaddrhead))
			return (ENETUNREACH);
		/*
		 * If the destination address is UNSPECIFIED addr,
		 * use the loopback addr, e.g ::1.
		 */
		sin6->sin6_addr = in6addr_loopback;
	}

	error = prison_remote_ip6(inp->inp_cred, &sin6->sin6_addr);
	if (error != 0)
		return (error);

	if (!sas_required) {
		/*
		 * Source address selection isn't required when syncache
		 * has already established connection and both source and
		 * destination addresses were chosen.
		 *
		 * This also includes the case when fwd_tag was used to
		 * select source address in tcp_input().
		 */
		src = inp->in6p_laddr;
	} else {
		error = in6_selectsrc_socket(sin6, inp->in6p_outputopts,
		    inp, inp->inp_cred, ambiguous, &src, NULL);
		if (error != 0)
			return (error);
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src))
		return (EHOSTUNREACH);

	/*
	 * Do not update this earlier, in case we return with an error.
	 *
	 * XXX: this in6_selectsrc_socket result might replace the bound local
	 * address with the address specified by setsockopt(IPV6_PKTINFO).
	 * Is it the intended behavior?
	 */
	*plocal_addr6 = src;

	/*
	 * Don't do pcblookup call here; return interface in
	 * plocal_addr6
	 * and exit to caller, that will do the lookup.
	 */
	return (0);
}
/*
 * Outer subroutine:
 * Connect 'inp' to the address and port given in 'sin6'; both must be
 * specified.  If the inpcb has no local address yet, one is chosen, and if it
 * also has no local port, one is allocated.
 *
 * Returns 0 on success, the error from in6_pcbladdr() or
 * in_pcb_lport_dest(), or EADDRINUSE when the resulting 4-tuple is already
 * present in the hash.
 */
int
in6_pcbconnect(struct inpcb *inp, struct sockaddr_in6 *sin6, struct ucred *cred,
    bool sas_required)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct sockaddr_in6 laddr6;
	const struct in6_addr *src;
	bool unbound;
	int error;

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT(sin6->sin6_family == AF_INET6,
	    ("%s: invalid address family for %p", __func__, sin6));
	KASSERT(sin6->sin6_len == sizeof(*sin6),
	    ("%s: invalid address length for %p", __func__, sin6));
	KASSERT(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr),
	    ("%s: inp is already connected", __func__));

	memset(&laddr6, 0, sizeof(laddr6));
	laddr6.sin6_family = AF_INET6;

#ifdef ROUTE_MPATH
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_type;

		inp->inp_flowid = fib6_calc_software_hash(&inp->in6p_laddr,
		    &sin6->sin6_addr, 0, sin6->sin6_port,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);
		inp->inp_flowtype = hash_type;
	}
#endif

	/*
	 * Call inner routine, to assign local interface address.
	 * in6_pcbladdr() may automatically fill in sin6_scope_id.
	 */
	error = in6_pcbladdr(inp, sin6, &laddr6.sin6_addr, sas_required);
	if (error != 0)
		return (error);

	/* Use the freshly selected address only if none is bound yet. */
	unbound = IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
	src = unbound ? &laddr6.sin6_addr : &inp->in6p_laddr;

	if (in6_pcblookup_hash_locked(pcbinfo, &sin6->sin6_addr,
	    sin6->sin6_port, src, inp->inp_lport, 0, M_NODOM,
	    RT_ALL_FIBS) != NULL)
		return (EADDRINUSE);

	if (unbound) {
		if (inp->inp_lport == 0) {
			error = in_pcb_lport_dest(inp,
			    (struct sockaddr *) &laddr6, &inp->inp_lport,
			    (struct sockaddr *) sin6, sin6->sin6_port, cred,
			    INPLOOKUP_WILDCARD);
			if (error != 0)
				return (error);
		}
		inp->in6p_laddr = laddr6.sin6_addr;
	}
	inp->in6p_faddr = sin6->sin6_addr;
	inp->inp_fport = sin6->sin6_port;

	/* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
	inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
	if ((inp->inp_flags & IN6P_AUTOFLOWLABEL) != 0)
		inp->inp_flow |=
		    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);

	/*
	 * NOTE(review): the result of in_pcbinshash() is ignored here, while
	 * in6_pcbbind() treats a non-zero result as EAGAIN -- confirm that
	 * the insertion cannot fail on this path.
	 */
	if ((inp->inp_flags & INP_INHASHLIST) == 0)
		in_pcbinshash(inp);
	else
		in_pcbrehash(inp);

	return (0);
}
void
in6_pcbdisconnect(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
("%s: inp %p was already disconnected", __func__, inp));
in_pcbremhash_locked(inp);
/* See the comment in in_pcbinshash(). */
inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
/* XXX-MJ torn writes are visible to SMR lookup */
memset(&inp->in6p_laddr, 0, sizeof(inp->in6p_laddr));
memset(&inp->in6p_faddr, 0, sizeof(inp->in6p_faddr));
inp->inp_fport = 0;
/* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
}
/*
 * Fill 'sa' with the socket's local IPv6 address and port.  Always returns 0;
 * a failure of sa6_recoverscope() is ignored.
 */
int
in6_getsockaddr(struct socket *so, struct sockaddr *sa)
{
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in6_getsockaddr: inp == NULL"));

	*sin6 = (struct sockaddr_in6 ){
		.sin6_len = sizeof(*sin6),
		.sin6_family = AF_INET6,
		.sin6_port = inp->inp_lport,
		.sin6_addr = inp->in6p_laddr,
	};
	/* XXX: should catch errors */
	(void)sa6_recoverscope(sin6);

	return (0);
}
/*
 * Fill 'sa' with the socket's foreign IPv6 address and port.  Always returns
 * 0; a failure of sa6_recoverscope() is ignored.
 */
int
in6_getpeeraddr(struct socket *so, struct sockaddr *sa)
{
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in6_getpeeraddr: inp == NULL"));

	*sin6 = (struct sockaddr_in6 ){
		.sin6_len = sizeof(*sin6),
		.sin6_family = AF_INET6,
		.sin6_port = inp->inp_fport,
		.sin6_addr = inp->in6p_faddr,
	};
	/* XXX: should catch errors */
	(void)sa6_recoverscope(sin6);

	return (0);
}
/*
 * Return the socket's local address in 'sa' as a sockaddr_in6.  With INET
 * compiled in, an inpcb that is IPv4-only (INP_IPV4 set, INP_IPV6 clear) has
 * its IPv4 address converted via in6_sin_2_v4mapsin6(); everything else goes
 * through in6_getsockaddr().
 */
int
in6_mapped_sockaddr(struct socket *so, struct sockaddr *sa)
{
#ifdef INET
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in6_mapped_sockaddr: inp == NULL"));

	if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) {
		struct sockaddr_in sin;
		int error;

		error = in_getsockaddr(so, (struct sockaddr *)&sin);
		if (error == 0)
			in6_sin_2_v4mapsin6(&sin, (struct sockaddr_in6 *)sa);
		return (error);
	}
#endif
	/* scope issues will be handled in in6_getsockaddr(). */
	return (in6_getsockaddr(so, sa));
}
/*
 * Return the socket's foreign address in 'sa' as a sockaddr_in6.  With INET
 * compiled in, an inpcb that is IPv4-only (INP_IPV4 set, INP_IPV6 clear) has
 * its IPv4 address converted via in6_sin_2_v4mapsin6(); everything else goes
 * through in6_getpeeraddr().
 */
int
in6_mapped_peeraddr(struct socket *so, struct sockaddr *sa)
{
#ifdef INET
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in6_mapped_peeraddr: inp == NULL"));

	if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) {
		struct sockaddr_in sin;
		int error;

		error = in_getpeeraddr(so, (struct sockaddr *)&sin);
		if (error == 0)
			in6_sin_2_v4mapsin6(&sin, (struct sockaddr_in6 *)sa);
		return (error);
	}
#endif
	/* scope issues will be handled in in6_getpeeraddr(). */
	return (in6_getpeeraddr(so, sa));
}
/*
* Pass some notification to all connections of a protocol
* associated with address dst. The local address and/or port numbers
* may be specified to limit the search. The "usual action" will be
* taken, depending on the ctlinput cmd. The caller must filter any
* cmds that are uninteresting (e.g., no error in the map).
* Call the protocol specific routine (if any) to report
* any errors for each matching socket.
*/
/* Iterator filter: accept only inpcbs that have INP_IPV6 set in inp_vflag. */
static bool
inp_match6(const struct inpcb *inp, void *v __unused)
{
	const int is_v6 = inp->inp_vflag & INP_IPV6;

	return (is_v6 != 0);
}
/*
 * Walk all IPv6 inpcbs of 'pcbinfo' (each write-locked by the iterator) and
 * call 'notify' on those that match the given destination and, optionally,
 * source address and ports.  Nothing is done when the destination address is
 * unspecified.  When 'errno' is EMSGSIZE and 'cmdarg' is non-NULL,
 * ip6_notify_pmtu() is called for every visited inpcb, before the match test.
 */
void
in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr_in6 *sa6_dst,
    u_int fport_arg, const struct sockaddr_in6 *src, u_int lport_arg,
    int errno, void *cmdarg,
    struct inpcb *(*notify)(struct inpcb *, int))
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_match6, NULL);
	struct sockaddr_in6 sa6_src;
	struct inpcb *inp;
	const u_short fport = fport_arg, lport = lport_arg;
	u_int32_t flowinfo;

	if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr))
		return;

	/*
	 * note that src can be NULL when we get notify by local fragmentation.
	 */
	if (src != NULL)
		sa6_src = *src;
	else
		sa6_src = sa6_any;
	flowinfo = sa6_src.sin6_flowinfo;

	while ((inp = inp_next(&inpi)) != NULL) {
		bool flow_hit, tuple_miss;

		INP_WLOCK_ASSERT(inp);

		/*
		 * If the error designates a new path MTU for a destination
		 * and the application (associated with this socket) wanted to
		 * know the value, notify.
		 * XXX: should we avoid to notify the value to TCP sockets?
		 */
		if (errno == EMSGSIZE && cmdarg != NULL)
			ip6_notify_pmtu(inp, sa6_dst, *(uint32_t *)cmdarg);

		/*
		 * Detect if we should notify the error.  If no source and
		 * destination ports are specified, but non-zero flowinfo and
		 * local address match, notify the error.  This is the case
		 * when the error is delivered with an encrypted buffer
		 * by ESP.  Otherwise, just compare addresses and ports
		 * as usual.
		 */
		flow_hit = lport == 0 && fport == 0 && flowinfo != 0 &&
		    inp->inp_socket != NULL &&
		    flowinfo == (inp->inp_flow & IPV6_FLOWLABEL_MASK) &&
		    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr);

		tuple_miss = !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr,
		    &sa6_dst->sin6_addr) ||
		    inp->inp_socket == NULL ||
		    (lport != 0 && inp->inp_lport != lport) ||
		    (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) &&
		    !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
		    &sa6_src.sin6_addr)) ||
		    (fport != 0 && inp->inp_fport != fport);

		if (!flow_hit && tuple_miss)
			continue;

		if (notify != NULL)
			(void)notify(inp, errno);
	}
}
/*
 * Lookup a PCB based on the local address and port. Caller must hold the
 * hash lock. No inpcb locks or references are acquired.
 *
 * Only IPv6 PCBs in the same prison as 'cred' are considered, and when 'fib'
 * is not RT_ALL_FIBS the PCB's FIB number must match it.
 *
 * Without INPLOOKUP_WILDCARD, only an unconnected PCB whose local address and
 * port equal 'laddr'/'lport' is returned. With INPLOOKUP_WILDCARD, a best-fit
 * search is performed: each candidate is scored by how many of its foreign
 * and local addresses differ in "specified-ness" from the request, and the
 * candidate with the lowest score wins (a score of 0 ends the search).
 *
 * Returns the matching PCB, or NULL if there is none.
 */
struct inpcb *
in6_pcblookup_local(struct inpcbinfo *pcbinfo, const struct in6_addr *laddr,
u_short lport, int fib, int lookupflags, struct ucred *cred)
{
struct inpcb *inp;
int matchwild = 3, wildcard;
KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
("%s: invalid lookup flags %d", __func__, lookupflags));
KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs),
("%s: invalid fib %d", __func__, fib));
INP_HASH_LOCK_ASSERT(pcbinfo);
if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
struct inpcbhead *head;
/*
* Look for an unconnected (wildcard foreign addr) PCB that
* matches the local address and port we're looking for.
*/
head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
pcbinfo->ipi_hashmask)];
CK_LIST_FOREACH(inp, head, inp_hash_wild) {
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV6) == 0)
continue;
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
inp->inp_lport == lport && (fib == RT_ALL_FIBS ||
inp->inp_inc.inc_fibnum == fib)) {
/* Found. */
if (prison_equal_ip6(cred->cr_prison,
inp->inp_cred->cr_prison))
return (inp);
}
}
/*
* Not found.
*/
return (NULL);
} else {
- struct inpcbporthead *porthash;
- struct inpcbport *phd;
+ struct inpcbhead *porthash;
struct inpcb *match = NULL;
+
/*
- * Best fit PCB lookup.
- *
- * First see if this local port is in use by looking on the
- * port hash list.
+ * Port is in use by one or more PCBs. Look for best
+ * fit.
*/
porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
pcbinfo->ipi_porthashmask)];
- CK_LIST_FOREACH(phd, porthash, phd_hash) {
- if (phd->phd_port == lport)
- break;
- }
- if (phd != NULL) {
- /*
- * Port is in use by one or more PCBs. Look for best
- * fit.
- */
- CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
- wildcard = 0;
- if (!prison_equal_ip6(cred->cr_prison,
- inp->inp_cred->cr_prison))
- continue;
- /* XXX inp locking */
- if ((inp->inp_vflag & INP_IPV6) == 0)
- continue;
- if (fib != RT_ALL_FIBS &&
- inp->inp_inc.inc_fibnum != fib)
+ CK_LIST_FOREACH(inp, porthash, inp_portlist) {
+ if (inp->inp_lport != lport)
+ continue;
+ if (!prison_equal_ip6(cred->cr_prison,
+ inp->inp_cred->cr_prison))
+ continue;
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV6) == 0)
+ continue;
+ if (fib != RT_ALL_FIBS &&
+ inp->inp_inc.inc_fibnum != fib)
+ continue;
+ wildcard = 0;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
+ wildcard++;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
+ if (IN6_IS_ADDR_UNSPECIFIED(laddr))
+ wildcard++;
+ else if (!IN6_ARE_ADDR_EQUAL(
+ &inp->in6p_laddr, laddr))
continue;
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
+ } else {
+ if (!IN6_IS_ADDR_UNSPECIFIED(laddr))
wildcard++;
- if (!IN6_IS_ADDR_UNSPECIFIED(
- &inp->in6p_laddr)) {
- if (IN6_IS_ADDR_UNSPECIFIED(laddr))
- wildcard++;
- else if (!IN6_ARE_ADDR_EQUAL(
- &inp->in6p_laddr, laddr))
- continue;
- } else {
- if (!IN6_IS_ADDR_UNSPECIFIED(laddr))
- wildcard++;
- }
- if (wildcard < matchwild) {
- match = inp;
- matchwild = wildcard;
- if (matchwild == 0)
- break;
- }
+ }
+ if (wildcard < matchwild) {
+ match = inp;
+ matchwild = wildcard;
+ if (matchwild == 0)
+ break;
}
}
return (match);
}
}
/*
 * Iterator filter: accept inpcbs that have INP_IPV6 set and carry IPv6
 * multicast options.
 */
static bool
in6_multi_match(const struct inpcb *inp, void *v __unused)
{

	return ((inp->inp_vflag & INP_IPV6) && inp->in6p_moptions != NULL);
}
/*
 * Remove every reference to interface 'ifp' from the multicast options of
 * the IPv6 inpcbs in 'pcbinfo'.
 *
 * The iterator visits only inpcbs accepted by in6_multi_match() and hands
 * each one back read-locked.  The caller must hold the IN6_MULTI lock.
 */
void
in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_RLOCKPCB,
in6_multi_match, NULL);
struct inpcb *inp;
struct in6_multi *inm;
struct in6_mfilter *imf;
struct ip6_moptions *im6o;
IN6_MULTI_LOCK_ASSERT();
while ((inp = inp_next(&inpi)) != NULL) {
INP_RLOCK_ASSERT(inp);
/* Non-NULL here: in6_multi_match() filtered on in6p_moptions. */
im6o = inp->in6p_moptions;
/*
* Unselect the outgoing ifp for multicast if it
* is being detached.
*/
if (im6o->im6o_multicast_ifp == ifp)
im6o->im6o_multicast_ifp = NULL;
/*
* Drop multicast group membership if we joined
* through the interface being detached.
*/
restart:
IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) {
if ((inm = imf->im6f_in6m) == NULL)
continue;
if (inm->in6m_ifp != ifp)
continue;
ip6_mfilter_remove(&im6o->im6o_head, imf);
in6_leavegroup_locked(inm, NULL);
ip6_mfilter_free(imf);
/* The list was modified; rescan it from the beginning. */
goto restart;
}
}
}
/*
 * Check for alternatives when higher level complains
 * about service problems. For now, invalidate cached
 * routing information. If the route was created dynamically
 * (by a redirect), time to try a default gateway again.
 *
 * NOTE(review): the only action taken in this function is invalidating the
 * inpcb's cached IPv6 route (inp_route6); no redirect-specific handling is
 * visible here.
 */
void
in6_losing(struct inpcb *inp)
{
RO_INVALIDATE_CACHE(&inp->inp_route6);
}
/*
 * After a routing change, drop the inpcb's cached IPv6 route so that a
 * (hopefully) better one is picked up later.  The error argument is unused;
 * the inpcb is handed back unchanged.
 */
struct inpcb *
in6_rtchange(struct inpcb *inp, int errno __unused)
{

	RO_INVALIDATE_CACHE(&inp->inp_route6);

	return (inp);
}
static bool
in6_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
{
return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
(fib == RT_ALL_FIBS || fib == grp->il_fibnum));
}
/*
 * Find a load-balancing group bound to 'lport' that can accept a packet for
 * 'laddr', and pick one of its member inpcbs using a hash of the foreign
 * address and the ports.  Returns NULL when no group matches or the chosen
 * group currently has no members.
 */
static struct inpcb *
in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in6_addr *faddr, uint16_t fport, const struct in6_addr *laddr,
    uint16_t lport, uint8_t domain, int fib)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
	struct inpcb *inp;
	u_int count;

	INP_HASH_LOCK_ASSERT(pcbinfo);
	NET_EPOCH_ASSERT();

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool exact, injail;

#ifdef INET
		if ((grp->il_vflag & INP_IPV6) == 0)
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		injail = prison_flag(grp->il_cred, PR_IP6) != 0;
		if (injail && prison_check_ip6_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		/* The group must be bound to laddr or to the wildcard. */
		exact = IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr);
		if (!exact && !IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr))
			continue;

		if (exact && injail) {
			jail_exact = grp;
			if (in6_pcblookup_lb_match(grp, domain, fib))
				/* This is a perfect match. */
				goto out;
		} else if (exact) {
			if (local_exact == NULL ||
			    in6_pcblookup_lb_match(grp, domain, fib))
				local_exact = grp;
		} else if (injail) {
			if (jail_wild == NULL ||
			    in6_pcblookup_lb_match(grp, domain, fib))
				jail_wild = grp;
		} else {
			if (local_wild == NULL ||
			    in6_pcblookup_lb_match(grp, domain, fib))
				local_wild = grp;
		}
	}

	/* Take the best candidate in order of preference. */
	grp = jail_exact != NULL ? jail_exact :
	    jail_wild != NULL ? jail_wild :
	    local_exact != NULL ? local_exact : local_wild;
	if (grp == NULL)
		return (NULL);
out:
	/*
	 * Synchronize with in_pcblbgroup_insert().
	 */
	count = atomic_load_acq_int(&grp->il_inpcnt);
	if (count == 0)
		return (NULL);
	inp = grp->il_inp[INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	return (inp);
}
/*
 * Return true when 'inp' is an IPv6 inpcb whose full 4-tuple equals the one
 * supplied.
 */
static bool
in6_pcblookup_exact_match(const struct inpcb *inp, const struct in6_addr *faddr,
    u_short fport, const struct in6_addr *laddr, u_short lport)
{

	/* XXX inp locking */
	return ((inp->inp_vflag & INP_IPV6) != 0 &&
	    IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) &&
	    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
	    inp->inp_fport == fport && inp->inp_lport == lport);
}
/*
 * Scan the exact-match hash bucket for the given 4-tuple and return the first
 * inpcb that matches it, or NULL.
 */
static struct inpcb *
in6_pcblookup_hash_exact(struct inpcbinfo *pcbinfo,
    const struct in6_addr *faddr, u_short fport,
    const struct in6_addr *laddr, u_short lport)
{
	struct inpcbhead *bucket;
	struct inpcb *cand;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	bucket = &pcbinfo->ipi_hash_exact[INP6_PCBHASH(faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(cand, bucket, inp_hash_exact) {
		if (!in6_pcblookup_exact_match(cand, faddr, fport, laddr,
		    lport))
			continue;
		return (cand);
	}
	return (NULL);
}
/*
 * Result of in6_pcblookup_wild_match(): how an unconnected inpcb relates to
 * the local address being looked up.
 */
typedef enum {
/* The inpcb does not match. */
INPLOOKUP_MATCH_NONE = 0,
/* The inpcb matches and is bound to the unspecified local address. */
INPLOOKUP_MATCH_WILD = 1,
/* The inpcb matches and is bound to exactly the requested local address. */
INPLOOKUP_MATCH_LADDR = 2,
} inp_lookup_match_t;
/*
 * Classify how an inpcb matches a wildcard lookup for 'laddr'/'lport'.
 *
 * Only IPv6 inpcbs with an unspecified foreign address, the same local port
 * and (unless 'fib' is RT_ALL_FIBS) the same FIB number can match.
 */
static inp_lookup_match_t
in6_pcblookup_wild_match(const struct inpcb *inp, const struct in6_addr *laddr,
    u_short lport, int fib)
{

	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV6) == 0 ||
	    !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
	    inp->inp_lport != lport ||
	    (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib))
		return (INPLOOKUP_MATCH_NONE);

	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
		return (INPLOOKUP_MATCH_WILD);

	return (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) ?
	    INPLOOKUP_MATCH_LADDR : INPLOOKUP_MATCH_NONE);
}
/*
 * Sentinel returned by the SMR lookup helpers to tell the caller that the
 * lockless lookup was inconclusive and a serialized lookup must be done.
 */
#define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1)
/*
 * Lockless wildcard lookup; must be called inside an SMR read section.
 *
 * Scans the wild hash bucket for 'lport' and tries to lock the first inpcb
 * that matches.  Returns:
 *  - the inpcb, locked according to 'lockflags', if it still matches after
 *    locking and passes the prison check for 'laddr';
 *  - INP_LOOKUP_AGAIN if a candidate was found but could not be locked or no
 *    longer qualified once locked;
 *  - NULL if the bucket holds no candidate.
 */
static struct inpcb *
in6_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo,
const struct in6_addr *laddr, u_short lport, int fib,
const inp_lookup_t lockflags)
{
struct inpcbhead *head;
struct inpcb *inp;
KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
("%s: not in SMR read section", __func__));
head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
pcbinfo->ipi_hashmask)];
CK_LIST_FOREACH(inp, head, inp_hash_wild) {
inp_lookup_match_t match;
/* Unlocked pre-check; re-validated below once the lock is held. */
match = in6_pcblookup_wild_match(inp, laddr, lport, fib);
if (match == INPLOOKUP_MATCH_NONE)
continue;
if (__predict_true(inp_smr_lock(inp, lockflags))) {
match = in6_pcblookup_wild_match(inp, laddr, lport,
fib);
if (match != INPLOOKUP_MATCH_NONE &&
prison_check_ip6_locked(inp->inp_cred->cr_prison,
laddr) == 0)
return (inp);
inp_unlock(inp, lockflags);
}
/*
* The matching socket disappeared out from under us. Fall back
* to a serialized lookup.
*/
/*
* NOTE(review): the scan never continues past the first candidate;
* presumably inp_smr_lock() leaves the SMR section -- confirm.
*/
return (INP_LOOKUP_AGAIN);
}
return (NULL);
}
/*
 * Wildcard lookup under the hash lock: pick the best unconnected inpcb bound
 * to 'lport' that can accept traffic for 'laddr'.  Returns NULL if there is
 * none.
 */
static struct inpcb *
in6_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo,
    const struct in6_addr *laddr, u_short lport, int fib)
{
	struct inpcbhead *bucket;
	struct inpcb *inp, *jail_wild, *local_exact, *local_wild;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 *      1. jailed, non-wild.
	 *      2. jailed, wild.
	 *      3. non-jailed, non-wild.
	 *      4. non-jailed, wild.
	 */
	bucket = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	jail_wild = local_exact = local_wild = NULL;

	CK_LIST_FOREACH(inp, bucket, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in6_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (prison_flag(inp->inp_cred, PR_IP6) != 0) {
			/* Jailed socket: laddr must belong to its prison. */
			if (prison_check_ip6_locked(
			    inp->inp_cred->cr_prison, laddr) != 0)
				continue;
			if (match == INPLOOKUP_MATCH_LADDR)
				return (inp);
			jail_wild = inp;
		} else {
			/*
			 * Once a non-jailed exact match is known, further
			 * non-jailed candidates cannot improve on it.
			 */
			if (local_exact != NULL)
				continue;
			if (match == INPLOOKUP_MATCH_LADDR)
				local_exact = inp;
			else
				local_wild = inp;
		}
	}

	if (jail_wild != NULL)
		return (jail_wild);
	return (local_exact != NULL ? local_exact : local_wild);
}
/*
 * Look up an inpcb by 4-tuple with the hash lock held.  An exact match is
 * tried first; when INPLOOKUP_WILDCARD is set and no exact match exists, the
 * load-balancing groups and then the wildcard hash are consulted.  No inpcb
 * lock is taken on the result.
 */
struct inpcb *
in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
    const struct in6_addr *faddr, u_int fport_arg,
    const struct in6_addr *laddr, u_int lport_arg,
    int lookupflags, uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(!IN6_IS_ADDR_UNSPECIFIED(faddr),
	    ("%s: invalid foreign address", __func__));
	KASSERT(!IN6_IS_ADDR_UNSPECIFIED(laddr),
	    ("%s: invalid local address", __func__));
	INP_HASH_LOCK_ASSERT(pcbinfo);

	inp = in6_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL || (lookupflags & INPLOOKUP_WILDCARD) == 0)
		return (inp);

	inp = in6_pcblookup_lbgroup(pcbinfo, faddr, fport, laddr, lport,
	    numa_domain, fib);
	if (inp != NULL)
		return (inp);

	return (in6_pcblookup_hash_wild_locked(pcbinfo, laddr, lport, fib));
}
/*
 * Serialized lookup: take the hash write lock, find the inpcb, and return it
 * locked according to the lock bits in 'lookupflags'.  If the inpcb lock
 * cannot be taken immediately, a reference is held across dropping the hash
 * lock and blocking on the inpcb lock; NULL is returned if in_pcbrele()
 * reports true afterwards.
 */
static struct inpcb *
in6_pcblookup_hash(struct inpcbinfo *pcbinfo, const struct in6_addr *faddr,
    u_int fport, const struct in6_addr *laddr, u_int lport, int lookupflags,
    uint8_t numa_domain, int fib)
{
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
	struct inpcb *inp;

	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	INP_HASH_WLOCK(pcbinfo);
	inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);

	/* Nothing found, or found and locked without blocking. */
	if (inp == NULL || inp_trylock(inp, lockflags)) {
		INP_HASH_WUNLOCK(pcbinfo);
		return (inp);
	}

	/* Contended: pin the inpcb, drop the hash lock, then block. */
	in_pcbref(inp);
	INP_HASH_WUNLOCK(pcbinfo);
	inp_lock(inp, lockflags);
	if (in_pcbrele(inp, lockflags)) {
		/* XXX-MJ or retry until we get a negative match? */
		return (NULL);
	}
	return (inp);
}
/*
 * Lockless (SMR) lookup of an inpcb by 4-tuple.
 *
 * An exact match is tried first; if INPLOOKUP_WILDCARD is set and no exact
 * match exists, the load-balancing groups and then the wildcard hash are
 * consulted.  A found inpcb is locked according to the lock bits in
 * 'lookupflags' and re-validated after locking.  Whenever locking or
 * re-validation fails, the function falls back to in6_pcblookup_hash(), which
 * performs the lookup under the hash lock.
 *
 * Returns the locked inpcb, or NULL if there is no match.
 */
static struct inpcb *
in6_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, const struct in6_addr *faddr,
u_int fport_arg, const struct in6_addr *laddr, u_int lport_arg,
int lookupflags, uint8_t numa_domain, int fib)
{
struct inpcb *inp;
const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
const u_short fport = fport_arg, lport = lport_arg;
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
("%s: invalid lookup flags %d", __func__, lookupflags));
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
("%s: LOCKPCB not set", __func__));
smr_enter(pcbinfo->ipi_smr);
inp = in6_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
if (inp != NULL) {
if (__predict_true(inp_smr_lock(inp, lockflags))) {
/* Re-check under the lock: the 4-tuple may have changed. */
if (__predict_true(in6_pcblookup_exact_match(inp,
faddr, fport, laddr, lport)))
return (inp);
inp_unlock(inp, lockflags);
}
/*
* We failed to lock the inpcb, or its connection state changed
* out from under us. Fall back to a precise search.
*/
return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
lookupflags, numa_domain, fib));
}
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
inp = in6_pcblookup_lbgroup(pcbinfo, faddr, fport,
laddr, lport, numa_domain, fib);
if (inp != NULL) {
if (__predict_true(inp_smr_lock(inp, lockflags))) {
/* Re-check under the lock that it is still a wildcard match. */
if (__predict_true(in6_pcblookup_wild_match(inp,
laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
return (inp);
inp_unlock(inp, lockflags);
}
inp = INP_LOOKUP_AGAIN;
} else {
inp = in6_pcblookup_hash_wild_smr(pcbinfo, laddr, lport,
fib, lockflags);
}
if (inp == INP_LOOKUP_AGAIN) {
return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr,
lport, lookupflags, numa_domain, fib));
}
}
/*
* NOTE(review): smr_exit() is only called here for the NULL result;
* presumably inp_smr_lock() leaves the SMR section on the other paths --
* confirm.
*/
if (inp == NULL)
smr_exit(pcbinfo->ipi_smr);
return (inp);
}
/*
 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
 * from which a pre-calculated hash value may be extracted.
 *
 * This variant takes the FIB from 'ifp' when INPLOOKUP_FIB is set and does
 * not restrict the lookup to a NUMA domain.
 */
struct inpcb *
in6_pcblookup(struct inpcbinfo *pcbinfo, const struct in6_addr *faddr,
    u_int fport, const struct in6_addr *laddr, u_int lport, int lookupflags,
    struct ifnet *ifp)
{
	int fib;

	if (lookupflags & INPLOOKUP_FIB)
		fib = if_getfib(ifp);
	else
		fib = RT_ALL_FIBS;

	return (in6_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags, M_NODOM, fib));
}
/*
 * Like in6_pcblookup(), but takes the FIB (when INPLOOKUP_FIB is set) and the
 * NUMA domain from the packet header of 'm'.  'ifp' is unused.
 */
struct inpcb *
in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, const struct in6_addr *faddr,
    u_int fport, const struct in6_addr *laddr, u_int lport, int lookupflags,
    struct ifnet *ifp __unused, struct mbuf *m)
{
	int fib;

	M_ASSERTPKTHDR(m);

	if (lookupflags & INPLOOKUP_FIB)
		fib = M_GETFIB(m);
	else
		fib = RT_ALL_FIBS;

	return (in6_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags, m->m_pkthdr.numa_domain, fib));
}
void
init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m, int srcordst)
{
struct ip6_hdr *ip;
ip = mtod(m, struct ip6_hdr *);
bzero(sin6, sizeof(*sin6));
sin6->sin6_len = sizeof(*sin6);
sin6->sin6_family = AF_INET6;
sin6->sin6_addr = srcordst ? ip->ip6_dst : ip->ip6_src;
(void)sa6_recoverscope(sin6); /* XXX: should catch errors... */
return;
}
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Mon, Apr 27, 8:59 PM (11 h, 30 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
28434936
Default Alt Text
(168 KB)
Attached To
Mode
rG FreeBSD src repository
Attached
Detach File
Event Timeline
Log In to Comment