Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F144591433
D3687.1775618921.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
28 KB
Referenced Files
None
Subscribers
None
D3687.1775618921.diff
View Options
Index: lib/libc/sys/getsockopt.2
===================================================================
--- lib/libc/sys/getsockopt.2
+++ lib/libc/sys/getsockopt.2
@@ -28,7 +28,7 @@
.\" @(#)getsockopt.2 8.4 (Berkeley) 5/2/95
.\" $FreeBSD$
.\"
-.Dd April 5, 2013
+.Dd January 18, 2017
.Dt GETSOCKOPT 2
.Os
.Sh NAME
@@ -188,6 +188,7 @@
.It Dv SO_LISTENINCQLEN Ta "get incomplete queue length of the socket (get only)"
.It Dv SO_USER_COOKIE Ta "set the 'so_user_cookie' value for the socket (uint32_t, set only)"
.It Dv SO_TS_CLOCK Ta "set specific format of timestamp returned by SO_TIMESTAMP"
+.It Dv SO_MAX_PACING_RATE Ta "set the maximum transmit rate in bytes per second for the socket"
.El
.Pp
.Dv SO_DEBUG
@@ -515,6 +516,10 @@
returns the number of unaccepted complete connections.
.Dv SO_LISTENINCQLEN
returns the number of unaccepted incomplete connections.
+.Pp
+.Dv SO_MAX_PACING_RATE
+instructs the socket and underlying network adapter layers to limit the
+transfer rate to the given unsigned 32-bit value in bytes per second.
.Sh RETURN VALUES
.Rv -std
.Sh ERRORS
Index: sbin/ifconfig/ifconfig.8
===================================================================
--- sbin/ifconfig/ifconfig.8
+++ sbin/ifconfig/ifconfig.8
@@ -28,7 +28,7 @@
.\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94
.\" $FreeBSD$
.\"
-.Dd September 17, 2016
+.Dd January 18, 2017
.Dt IFCONFIG 8
.Os
.Sh NAME
@@ -460,6 +460,8 @@
and 802.11g
.Pq Cm 11g
operating modes.
+.It Cm txrtlmt
+Set if the driver supports TX rate limiting.
.It Cm inst Ar minst , Cm instance Ar minst
Set the media instance to
.Ar minst .
Index: sbin/ifconfig/ifconfig.c
===================================================================
--- sbin/ifconfig/ifconfig.c
+++ sbin/ifconfig/ifconfig.c
@@ -1145,7 +1145,7 @@
"\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
"\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
"\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT"
/*
* Print the status of the interface. If an address family was
@@ -1453,6 +1453,8 @@
DEF_CMD("-wol_mcast", -IFCAP_WOL_MCAST, setifcap),
DEF_CMD("wol_magic", IFCAP_WOL_MAGIC, setifcap),
DEF_CMD("-wol_magic", -IFCAP_WOL_MAGIC, setifcap),
+ DEF_CMD("txrtlmt", IFCAP_TXRTLMT, setifcap),
+ DEF_CMD("-txrtlmt", -IFCAP_TXRTLMT, setifcap),
DEF_CMD("normal", -IFF_LINK0, setifflags),
DEF_CMD("compress", IFF_LINK0, setifflags),
DEF_CMD("noicmp", IFF_LINK1, setifflags),
Index: sys/conf/NOTES
===================================================================
--- sys/conf/NOTES
+++ sys/conf/NOTES
@@ -619,6 +619,8 @@
options INET #Internet communications protocols
options INET6 #IPv6 communications protocols
+options RATELIMIT # TX rate limiting support
+
options ROUTETABLES=2 # allocated fibs up to 65536. default is 1.
# but that would be a bad idea as they are large.
Index: sys/conf/config.mk
===================================================================
--- sys/conf/config.mk
+++ sys/conf/config.mk
@@ -19,6 +19,10 @@
opt_inet6.h:
@echo "#define INET6 1" > ${.TARGET}
.endif
+.if ${MK_RATELIMIT} != "no"
+opt_ratelimit.h:
+ @echo "#define RATELIMIT 1" > ${.TARGET}
+.endif
.if ${MK_EISA} != "no"
opt_eisa.h:
@echo "#define DEV_EISA 1" > ${.TARGET}
Index: sys/conf/kern.opts.mk
===================================================================
--- sys/conf/kern.opts.mk
+++ sys/conf/kern.opts.mk
@@ -48,6 +48,7 @@
EXTRA_TCP_STACKS \
NAND \
OFED \
+ RATELIMIT \
REPRODUCIBLE_BUILD
# Some options are totally broken on some architectures. We disable
Index: sys/conf/options
===================================================================
--- sys/conf/options
+++ sys/conf/options
@@ -412,6 +412,7 @@
BOOTP_WIRED_TO opt_bootp.h
DEVICE_POLLING
DUMMYNET opt_ipdn.h
+RATELIMIT opt_ratelimit.h
INET opt_inet.h
INET6 opt_inet6.h
IPDIVERT
Index: sys/kern/uipc_socket.c
===================================================================
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -2699,6 +2699,14 @@
so->so_ts_clock = optval;
break;
+ case SO_MAX_PACING_RATE:
+ error = sooptcopyin(sopt, &val32, sizeof(val32),
+ sizeof(val32));
+ if (error)
+ goto bad;
+ so->so_max_pacing_rate = val32;
+ break;
+
default:
if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
error = hhook_run_socket(so, sopt,
@@ -2890,6 +2898,10 @@
optval = so->so_ts_clock;
goto integer;
+ case SO_MAX_PACING_RATE:
+ optval = so->so_max_pacing_rate;
+ goto integer;
+
default:
if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
error = hhook_run_socket(so, sopt,
Index: sys/modules/if_lagg/Makefile
===================================================================
--- sys/modules/if_lagg/Makefile
+++ sys/modules/if_lagg/Makefile
@@ -2,6 +2,6 @@
.PATH: ${.CURDIR}/../../net
KMOD= if_lagg
-SRCS= if_lagg.c ieee8023ad_lacp.c opt_inet.h opt_inet6.h
+SRCS= if_lagg.c ieee8023ad_lacp.c opt_inet.h opt_inet6.h opt_ratelimit.h
.include <bsd.kmod.mk>
Index: sys/modules/if_vlan/Makefile
===================================================================
--- sys/modules/if_vlan/Makefile
+++ sys/modules/if_vlan/Makefile
@@ -4,6 +4,6 @@
KMOD= if_vlan
SRCS= if_vlan.c
-SRCS+= opt_inet.h opt_vlan.h
+SRCS+= opt_inet.h opt_vlan.h opt_ratelimit.h
.include <bsd.kmod.mk>
Index: sys/net/ieee8023ad_lacp.h
===================================================================
--- sys/net/ieee8023ad_lacp.h
+++ sys/net/ieee8023ad_lacp.h
@@ -284,6 +284,9 @@
struct mbuf *lacp_input(struct lagg_port *, struct mbuf *);
struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *);
+#ifdef RATELIMIT
+struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t);
+#endif
void lacp_attach(struct lagg_softc *);
void lacp_detach(void *);
void lacp_init(struct lagg_softc *);
Index: sys/net/ieee8023ad_lacp.c
===================================================================
--- sys/net/ieee8023ad_lacp.c
+++ sys/net/ieee8023ad_lacp.c
@@ -30,6 +30,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_ratelimit.h"
+
#include <sys/param.h>
#include <sys/callout.h>
#include <sys/eventhandler.h>
@@ -853,6 +855,35 @@
return (lp->lp_lagg);
}
+
+#ifdef RATELIMIT
+/*
+ * Pick the LACP transmit port for a flow that is identified only by its
+ * flow ID.  Returns NULL while distribution is suppressed or when the
+ * active port map has no entries.
+ */
+struct lagg_port *
+lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t flowid)
+{
+	struct lacp_softc *lacp = LACP_SOFTC(sc);
+	struct lacp_portmap *map;
+	uint32_t slot;
+
+	if (__predict_false(lacp->lsc_suppress_distributing)) {
+		LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__));
+		return (NULL);
+	}
+
+	map = &lacp->lsc_pmap[lacp->lsc_activemap];
+	if (map->pm_count == 0) {
+		LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__));
+		return (NULL);
+	}
+
+	/* shift the flow ID by flowid_shift, then wrap onto the port map */
+	slot = (flowid >> sc->flowid_shift) % map->pm_count;
+
+	return (map->pm_map[slot]->lp_lagg);
+}
+#endif
+
/*
* lacp_suppress_distributing: drop transmit packets for a while
* to preserve packet ordering.
Index: sys/net/if.h
===================================================================
--- sys/net/if.h
+++ sys/net/if.h
@@ -239,6 +239,7 @@
#define IFCAP_RXCSUM_IPV6 0x200000 /* can offload checksum on IPv6 RX */
#define IFCAP_TXCSUM_IPV6 0x400000 /* can offload checksum on IPv6 TX */
#define IFCAP_HWSTATS 0x800000 /* manages counters internally */
+#define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */
#define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
Index: sys/net/if_dead.c
===================================================================
--- sys/net/if_dead.c
+++ sys/net/if_dead.c
@@ -100,6 +100,30 @@
return (0);
}
+static int
+ifdead_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+	return (EOPNOTSUPP);	/* stub: always refuses; *ppmt is not written */
+}
+
+static int
+ifdead_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
+{
+	return (EOPNOTSUPP);	/* stub: always refuses; tag and params are ignored */
+}
+
+static int
+ifdead_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
+{
+	return (EOPNOTSUPP);	/* stub: always refuses; params is left unmodified */
+}
+
+static void
+ifdead_snd_tag_free(struct m_snd_tag *pmt)
+{	/* stub: intentionally empty, the tag is not touched */
+}
+
void
if_dead(struct ifnet *ifp)
{
@@ -112,4 +136,8 @@
ifp->if_qflush = ifdead_qflush;
ifp->if_transmit = ifdead_transmit;
ifp->if_get_counter = ifdead_get_counter;
+ ifp->if_snd_tag_alloc = ifdead_snd_tag_alloc;
+ ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
+ ifp->if_snd_tag_query = ifdead_snd_tag_query;
+ ifp->if_snd_tag_free = ifdead_snd_tag_free;
}
Index: sys/net/if_lagg.c
===================================================================
--- sys/net/if_lagg.c
+++ sys/net/if_lagg.c
@@ -23,6 +23,7 @@
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/kernel.h>
@@ -118,6 +119,11 @@
static void lagg_init(void *);
static void lagg_stop(struct lagg_softc *);
static int lagg_ioctl(struct ifnet *, u_long, caddr_t);
+#ifdef RATELIMIT
+static int lagg_snd_tag_alloc(struct ifnet *,
+ union if_snd_tag_alloc_params *,
+ struct m_snd_tag **);
+#endif
static int lagg_ether_setmulti(struct lagg_softc *);
static int lagg_ether_cmdmulti(struct lagg_port *, int);
static int lagg_setflag(struct lagg_port *, int, int,
@@ -503,7 +509,12 @@
ifp->if_ioctl = lagg_ioctl;
ifp->if_get_counter = lagg_get_counter;
ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
+#ifdef RATELIMIT
+ ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
+ ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS | IFCAP_TXRTLMT;
+#else
ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
+#endif
/*
* Attach as an ordinary ethernet device, children will be attached
@@ -1549,6 +1560,52 @@
return (error);
}
+#ifdef RATELIMIT
+/*
+ * Allocate a send tag on behalf of the lagg interface by forwarding the
+ * request to the member port selected for the flow.
+ *
+ * For failover the active primary port is used.  For loadbalance and
+ * LACP the port is derived from the flow ID in "params->hdr", which
+ * requires LAGG_OPT_USE_FLOWID to be set and a hash type other than
+ * M_HASHTYPE_NONE.
+ *
+ * Returns EOPNOTSUPP when the protocol is not handled, when no port can
+ * be selected, or when the selected port has no if_snd_tag_alloc()
+ * method or does not have IFCAP_TXRTLMT enabled.  Otherwise returns the
+ * value of the port's if_snd_tag_alloc() method, which also fills in
+ * "*ppmt".
+ *
+ * NOTE(review): no lock is taken in this function; confirm that callers
+ * serialize against port list changes.
+ */
+static int
+lagg_snd_tag_alloc(struct ifnet *ifp,
+    union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
+	struct lagg_port *lp;
+	struct lagg_lb *lb;
+	uint32_t p;
+
+	switch (sc->sc_proto) {
+	case LAGG_PROTO_FAILOVER:
+		lp = lagg_link_active(sc, sc->sc_primary);
+		break;
+	case LAGG_PROTO_LOADBALANCE:
+		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
+		    params->hdr.flowtype == M_HASHTYPE_NONE)
+			return (EOPNOTSUPP);
+		/* without ports the modulo below would divide by zero */
+		if (sc->sc_count == 0)
+			return (EOPNOTSUPP);
+		p = params->hdr.flowid >> sc->flowid_shift;
+		p %= sc->sc_count;
+		lb = (struct lagg_lb *)sc->sc_psc;
+		lp = lb->lb_ports[p];
+		lp = lagg_link_active(sc, lp);
+		break;
+	case LAGG_PROTO_LACP:
+		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
+		    params->hdr.flowtype == M_HASHTYPE_NONE)
+			return (EOPNOTSUPP);
+		lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid);
+		break;
+	default:
+		return (EOPNOTSUPP);
+	}
+	if (lp == NULL)
+		return (EOPNOTSUPP);
+	/* from here on "ifp" refers to the selected member port */
+	ifp = lp->lp_ifp;
+	if (ifp == NULL || ifp->if_snd_tag_alloc == NULL ||
+	    (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
+		return (EOPNOTSUPP);
+
+	/* forward allocation request */
+	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
+}
+#endif
+
static int
lagg_ether_setmulti(struct lagg_softc *sc)
{
Index: sys/net/if_var.h
===================================================================
--- sys/net/if_var.h
+++ sys/net/if_var.h
@@ -175,6 +175,49 @@
#define IFENCAP_FLAG_BROADCAST 0x02 /* Destination is broadcast */
+/*
+ * Network interface send tag support. The storage of "struct
+ * m_snd_tag" comes from the network driver and it is free to allocate
+ * as much additional space as it wants for its own use.
+ *
+ * A tag is requested through the if_snd_tag_alloc() method, which takes
+ * a union of per-type allocation parameters.  The rate limit parameters
+ * start with the common header below, so the tag type and the flow
+ * information can be read through the union's "hdr" member.  The
+ * modify, query and free methods operate on a previously returned tag.
+ */
+struct m_snd_tag;
+
+#define IF_SND_TAG_TYPE_RATE_LIMIT 0	/* used together with the "rate_limit" parameters */
+#define IF_SND_TAG_TYPE_MAX 1	/* presumably the number of tag types - TODO confirm */
+
+struct if_snd_tag_alloc_header {
+	uint32_t type; /* send tag type, see IF_SND_TAG_XXX */
+	uint32_t flowid; /* mbuf hash value */
+	uint32_t flowtype; /* mbuf hash type */
+};
+
+struct if_snd_tag_alloc_rate_limit {
+	struct if_snd_tag_alloc_header hdr;	/* first member: overlays the union's "hdr" */
+	uint64_t max_rate; /* in bytes/s */
+};
+
+struct if_snd_tag_rate_limit_params {
+	uint64_t max_rate; /* in bytes/s */
+};
+
+union if_snd_tag_alloc_params {
+	struct if_snd_tag_alloc_header hdr;	/* type-independent view */
+	struct if_snd_tag_alloc_rate_limit rate_limit;	/* for IF_SND_TAG_TYPE_RATE_LIMIT */
+};
+
+union if_snd_tag_modify_params {
+	struct if_snd_tag_rate_limit_params rate_limit;	/* new rate to apply */
+};
+
+union if_snd_tag_query_params {
+	struct if_snd_tag_rate_limit_params rate_limit;	/* filled in by the query method */
+};
+
+typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
+    struct m_snd_tag **);
+typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
+typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
+typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
/*
* Structure defining a network interface.
@@ -304,12 +347,19 @@
u_int if_hw_tsomaxsegsize; /* TSO maximum segment size in bytes */
/*
+ * Network adapter send tag support:
+ */
+ if_snd_tag_alloc_t *if_snd_tag_alloc;
+ if_snd_tag_modify_t *if_snd_tag_modify;
+ if_snd_tag_query_t *if_snd_tag_query;
+ if_snd_tag_free_t *if_snd_tag_free;
+
+ /*
* Spare fields to be added before branching a stable branch, so
* that structure can be enhanced without changing the kernel
* binary interface.
*/
- void *if_pspare[4]; /* packet pacing / general use */
- int if_ispare[4]; /* packet pacing / general use */
+ int if_ispare[4]; /* general use */
};
/* for compatibility with other BSDs */
Index: sys/net/if_vlan.c
===================================================================
--- sys/net/if_vlan.c
+++ sys/net/if_vlan.c
@@ -46,6 +46,7 @@
#include "opt_inet.h"
#include "opt_vlan.h"
+#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/eventhandler.h>
@@ -212,6 +213,10 @@
static void vlan_init(void *foo);
static void vlan_input(struct ifnet *ifp, struct mbuf *m);
static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr);
+#ifdef RATELIMIT
+static int vlan_snd_tag_alloc(struct ifnet *,
+ union if_snd_tag_alloc_params *, struct m_snd_tag **);
+#endif
static void vlan_qflush(struct ifnet *ifp);
static int vlan_setflag(struct ifnet *ifp, int flag, int status,
int (*func)(struct ifnet *, int));
@@ -971,6 +976,9 @@
ifp->if_transmit = vlan_transmit;
ifp->if_qflush = vlan_qflush;
ifp->if_ioctl = vlan_ioctl;
+#ifdef RATELIMIT
+ ifp->if_snd_tag_alloc = vlan_snd_tag_alloc;
+#endif
ifp->if_flags = VLAN_IFFLAGS;
ether_ifattach(ifp, eaddr);
/* Now undo some of the damage... */
@@ -1591,6 +1599,15 @@
TOEDEV(ifp) = TOEDEV(p);
ifp->if_capenable |= p->if_capenable & IFCAP_TOE;
}
+
+#ifdef RATELIMIT
+ /*
+ * If the parent interface supports ratelimiting, so does the
+ * VLAN interface.
+ */
+ ifp->if_capabilities |= (p->if_capabilities & IFCAP_TXRTLMT);
+ ifp->if_capenable |= (p->if_capenable & IFCAP_TXRTLMT);
+#endif
}
static void
@@ -1801,3 +1818,19 @@
return (error);
}
+
+#ifdef RATELIMIT
+/*
+ * Allocate a send tag for a VLAN interface by forwarding the request to
+ * its trunk device.
+ *
+ * Returns EOPNOTSUPP if there is no trunk device, if the trunk device
+ * has no if_snd_tag_alloc() method or if it does not have IFCAP_TXRTLMT
+ * enabled.  Otherwise returns the value of the trunk device's
+ * if_snd_tag_alloc() method, which also fills in "*ppmt".
+ */
+static int
+vlan_snd_tag_alloc(struct ifnet *ifp,
+    union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+
+	/* get trunk device */
+	ifp = vlan_trunkdev(ifp);
+	/* never call through a missing method, even if the capability is set */
+	if (ifp == NULL || ifp->if_snd_tag_alloc == NULL ||
+	    (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
+		return (EOPNOTSUPP);
+	/* forward allocation request */
+	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
+}
+#endif
Index: sys/netinet/in_pcb.h
===================================================================
--- sys/netinet/in_pcb.h
+++ sys/netinet/in_pcb.h
@@ -181,6 +181,7 @@
* read-lock usage during modification, this model can be applied to other
* protocols (especially SCTP).
*/
+struct m_snd_tag;
struct inpcb {
LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */
LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
@@ -202,11 +203,11 @@
u_char inp_ip_minttl; /* (i) minimum TTL or drop */
uint32_t inp_flowid; /* (x) flow id / queue id */
u_int inp_refcount; /* (i) refcount */
- void *inp_pspare[5]; /* (x) packet pacing / general use */
+ struct m_snd_tag *inp_snd_tag; /* (i) send tag for outgoing mbufs */
+ void *inp_pspare[4]; /* (x) general use */
uint32_t inp_flowtype; /* (x) M_HASHTYPE value */
uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */
- u_int inp_ispare[4]; /* (x) packet pacing / user cookie /
- * general use */
+ u_int inp_ispare[4]; /* (x) user cookie / general use */
/* Local and foreign ports, local and foreign addr. */
struct in_conninfo inp_inc; /* (i) list for PCB's local port */
@@ -616,6 +617,7 @@
#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
#define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */
#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
+#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
/*
* Flags passed to in_pcblookup*() functions.
@@ -736,6 +738,14 @@
struct sockaddr *
in_sockaddr(in_port_t port, struct in_addr *addr);
void in_pcbsosetlabel(struct socket *so);
+#ifdef RATELIMIT
+int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
+void in_pcbdetach_txrtlmt(struct inpcb *);
+int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
+int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
+void in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
+void in_pcboutput_eagain(struct inpcb *);
+#endif
#endif /* _KERNEL */
#endif /* !_NETINET_IN_PCB_H_ */
Index: sys/netinet/in_pcb.c
===================================================================
--- sys/netinet/in_pcb.c
+++ sys/netinet/in_pcb.c
@@ -42,6 +42,7 @@
#include "opt_ipsec.h"
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_ratelimit.h"
#include "opt_pcbgroup.h"
#include "opt_rss.h"
@@ -57,6 +58,7 @@
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
+#include <sys/sockio.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
@@ -1140,6 +1142,10 @@
KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
+#ifdef RATELIMIT
+ if (inp->inp_snd_tag != NULL)
+ in_pcbdetach_txrtlmt(inp);
+#endif
inp->inp_socket->so_pcb = NULL;
inp->inp_socket = NULL;
}
@@ -2677,3 +2683,253 @@
db_print_inpcb(inp, "inpcb", 0);
}
#endif /* DDB */
+
+#ifdef RATELIMIT
+/*
+ * Modify TX rate limit based on the existing "inp->inp_snd_tag",
+ * if any.
+ *
+ * The new limit "max_pacing_rate" is handed to the if_snd_tag_modify()
+ * method of the network interface recorded in the send tag.
+ *
+ * Returns EINVAL if no send tag is attached or the tag has no network
+ * interface, EOPNOTSUPP if that interface has no if_snd_tag_modify()
+ * method, else the value returned by the method.
+ */
+int
+in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
+{
+	union if_snd_tag_modify_params params = {
+		.rate_limit.max_rate = max_pacing_rate,
+	};
+	struct m_snd_tag *mst;
+	struct ifnet *ifp;
+	int error;
+
+	mst = inp->inp_snd_tag;
+	if (mst == NULL)
+		return (EINVAL);
+
+	ifp = mst->ifp;
+	if (ifp == NULL)
+		return (EINVAL);
+
+	if (ifp->if_snd_tag_modify == NULL) {
+		error = EOPNOTSUPP;
+	} else {
+		error = ifp->if_snd_tag_modify(mst, &params);
+	}
+	return (error);
+}
+
+/*
+ * Query existing TX rate limit based on the existing
+ * "inp->inp_snd_tag", if any.
+ *
+ * On success the rate reported by the interface's if_snd_tag_query()
+ * method is stored in "*p_max_pacing_rate", unless that pointer is
+ * NULL.
+ *
+ * Returns EINVAL if no send tag is attached or the tag has no network
+ * interface, EOPNOTSUPP if that interface has no if_snd_tag_query()
+ * method, else the value returned by the method.
+ */
+int
+in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
+{
+	union if_snd_tag_query_params params = { };
+	struct m_snd_tag *mst;
+	struct ifnet *ifp;
+	int error;
+
+	mst = inp->inp_snd_tag;
+	if (mst == NULL)
+		return (EINVAL);
+
+	ifp = mst->ifp;
+	if (ifp == NULL)
+		return (EINVAL);
+
+	if (ifp->if_snd_tag_query == NULL) {
+		error = EOPNOTSUPP;
+	} else {
+		error = ifp->if_snd_tag_query(mst, &params);
+		/*
+		 * NOTE(review): the 64-bit "max_rate" is narrowed to
+		 * 32 bits by this assignment.
+		 */
+		if (error == 0 && p_max_pacing_rate != NULL)
+			*p_max_pacing_rate = params.rate_limit.max_rate;
+	}
+	return (error);
+}
+
+/*
+ * Allocate a new TX rate limit send tag from the network interface
+ * given by the "ifp" argument and save it in "inp->inp_snd_tag":
+ *
+ * "flowtype" and "flowid" are passed along in the allocation header and
+ * "max_pacing_rate" is the requested limit.  The inpcb must be write
+ * locked.
+ *
+ * Returns EINVAL if a send tag is already attached, EOPNOTSUPP if the
+ * interface has no if_snd_tag_alloc() method, else the value returned
+ * by the method.
+ */
+int
+in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
+    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
+{
+	union if_snd_tag_alloc_params params = {
+		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+		.rate_limit.hdr.flowid = flowid,
+		.rate_limit.hdr.flowtype = flowtype,
+		.rate_limit.max_rate = max_pacing_rate,
+	};
+	int error;
+
+	INP_WLOCK_ASSERT(inp);
+
+	if (inp->inp_snd_tag != NULL)
+		return (EINVAL);
+
+	if (ifp->if_snd_tag_alloc == NULL) {
+		error = EOPNOTSUPP;
+	} else {
+		error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
+
+		/*
+		 * At success increment the refcount on
+		 * the send tag's network interface:
+		 *
+		 * NOTE(review): this relies on the method having set
+		 * both the tag pointer and the tag's "ifp" field.
+		 */
+		if (error == 0)
+			if_ref(inp->inp_snd_tag->ifp);
+	}
+	return (error);
+}
+
+/*
+ * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
+ * if any:
+ *
+ * The inpcb must be write locked.  The tag pointer in the inpcb is
+ * cleared first; the tag is then handed to the if_snd_tag_free() method
+ * of the network interface recorded in the tag and one reference on
+ * that interface is released.
+ *
+ * NOTE(review): when the tag has no network interface the function
+ * returns after clearing the pointer, without freeing the tag.
+ */
+void
+in_pcbdetach_txrtlmt(struct inpcb *inp)
+{
+	struct m_snd_tag *mst;
+	struct ifnet *ifp;
+
+	INP_WLOCK_ASSERT(inp);
+
+	mst = inp->inp_snd_tag;
+	inp->inp_snd_tag = NULL;	/* cleared before the tag is freed below */
+
+	if (mst == NULL)
+		return;
+
+	ifp = mst->ifp;
+	if (ifp == NULL)
+		return;
+
+	/*
+	 * If the device was detached while we still had reference(s)
+	 * on the ifp, we assume if_snd_tag_free() was replaced with
+	 * stubs.
+	 */
+	ifp->if_snd_tag_free(mst);
+
+	/* release reference count on network interface */
+	if_rele(ifp);
+}
+
+/*
+ * This function should be called when the INP_RATE_LIMIT_CHANGED flag
+ * is set in the fast path and will attach/detach/modify the TX rate
+ * limit send tag based on the socket's so_max_pacing_rate value.
+ *
+ * "ifp" is the network interface whose capabilities are checked and
+ * from which a new send tag is allocated; "mb" only supplies the hash
+ * type and flow ID for that allocation.  Nothing is done when "inp" or
+ * its socket is NULL.
+ *
+ * If the inpcb is not write locked an upgrade is attempted and undone
+ * again before returning; presumably the caller holds at least a read
+ * lock - TODO confirm against callers.
+ *
+ * The flag is cleared only when the operation succeeded or returned
+ * EOPNOTSUPP; for any other error it stays set.
+ */
+void
+in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+{
+	struct socket *socket;
+	uint32_t max_pacing_rate;
+	bool did_upgrade;
+	int error;
+
+	if (inp == NULL)
+		return;
+
+	socket = inp->inp_socket;
+	if (socket == NULL)
+		return;
+
+	if (!INP_WLOCKED(inp)) {
+		/*
+		 * NOTE: If the write locking fails, we need to bail
+		 * out and use the non-ratelimited ring for the
+		 * transmit until there is a new chance to get the
+		 * write lock.
+		 */
+		if (!INP_TRY_UPGRADE(inp))
+			return;
+		did_upgrade = 1;
+	} else {
+		did_upgrade = 0;
+	}
+
+	/*
+	 * NOTE: The so_max_pacing_rate value is read unlocked,
+	 * because atomic updates are not required since the variable
+	 * is checked at every mbuf we send. It is assumed that the
+	 * variable read itself will be atomic.
+	 */
+	max_pacing_rate = socket->so_max_pacing_rate;
+
+	/*
+	 * NOTE: When attaching to a network interface a reference is
+	 * made to ensure the network interface doesn't go away until
+	 * all ratelimit connections are gone. The network interface
+	 * pointers compared below represent valid network interfaces,
+	 * except when comparing towards NULL.
+	 */
+	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
+		error = 0;	/* no limit requested and no tag attached */
+	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
+		if (inp->inp_snd_tag != NULL)
+			in_pcbdetach_txrtlmt(inp);
+		error = 0;	/* rate limiting not enabled on "ifp" */
+	} else if (inp->inp_snd_tag == NULL) {
+		/*
+		 * In order to utilize packet pacing with RSS, we need
+		 * to wait until there is a valid RSS hash before we
+		 * can proceed:
+		 */
+		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
+			error = EAGAIN;
+		} else {
+			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
+			    mb->m_pkthdr.flowid, max_pacing_rate);
+		}
+	} else {
+		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
+	}
+	if (error == 0 || error == EOPNOTSUPP)
+		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
+	if (did_upgrade)
+		INP_DOWNGRADE(inp);
+}
+
+/*
+ * Track route changes for TX rate limiting.
+ *
+ * Drops the send tag currently attached to "inp" and sets
+ * INP_RATE_LIMIT_CHANGED so that a new tag gets allocated later.
+ * Nothing is done when "inp" or its socket is NULL, or when no send
+ * tag is attached.
+ *
+ * If the inpcb is not write locked an upgrade is attempted and undone
+ * again before returning; presumably the caller holds at least a read
+ * lock - TODO confirm against callers.
+ */
+void
+in_pcboutput_eagain(struct inpcb *inp)
+{
+	struct socket *socket;
+	bool did_upgrade;
+
+	if (inp == NULL)
+		return;
+
+	socket = inp->inp_socket;
+	if (socket == NULL)
+		return;
+
+	if (inp->inp_snd_tag == NULL)
+		return;
+
+	if (!INP_WLOCKED(inp)) {
+		/*
+		 * NOTE: If the write locking fails, we need to bail
+		 * out and use the non-ratelimited ring for the
+		 * transmit until there is a new chance to get the
+		 * write lock.
+		 */
+		if (!INP_TRY_UPGRADE(inp))
+			return;
+		did_upgrade = 1;
+	} else {
+		did_upgrade = 0;
+	}
+
+	/* detach rate limiting */
+	in_pcbdetach_txrtlmt(inp);
+
+	/* make sure new mbuf send tag allocation is made */
+	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+
+	if (did_upgrade)
+		INP_DOWNGRADE(inp);
+}
+#endif /* RATELIMIT */
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -33,6 +33,7 @@
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
+#include "opt_ratelimit.h"
#include "opt_ipsec.h"
#include "opt_mbuf_stress_test.h"
#include "opt_mpath.h"
@@ -661,8 +662,23 @@
*/
m_clrprotoflags(m);
IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
+#ifdef RATELIMIT
+ if (inp != NULL) {
+ if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+ in_pcboutput_txrtlmt(inp, ifp, m);
+ /* stamp send tag on mbuf */
+ m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+ } else {
+ m->m_pkthdr.snd_tag = NULL;
+ }
+#endif
error = (*ifp->if_output)(ifp, m,
(const struct sockaddr *)gw, ro);
+#ifdef RATELIMIT
+ /* check for route change */
+ if (error == EAGAIN)
+ in_pcboutput_eagain(inp);
+#endif
goto done;
}
@@ -698,8 +714,23 @@
IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
mtod(m, struct ip *), NULL);
+#ifdef RATELIMIT
+ if (inp != NULL) {
+ if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+ in_pcboutput_txrtlmt(inp, ifp, m);
+ /* stamp send tag on mbuf */
+ m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+ } else {
+ m->m_pkthdr.snd_tag = NULL;
+ }
+#endif
error = (*ifp->if_output)(ifp, m,
(const struct sockaddr *)gw, ro);
+#ifdef RATELIMIT
+ /* check for route change */
+ if (error == EAGAIN)
+ in_pcboutput_eagain(inp);
+#endif
} else
m_freem(m);
}
@@ -974,6 +1005,16 @@
INP_WUNLOCK(inp);
error = 0;
break;
+ case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+ INP_WLOCK(inp);
+ inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+ INP_WUNLOCK(inp);
+ error = 0;
+#else
+ error = EOPNOTSUPP;
+#endif
+ break;
default:
break;
}
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -65,6 +65,7 @@
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_ratelimit.h"
#include "opt_ipsec.h"
#include "opt_sctp.h"
#include "opt_route.h"
@@ -954,8 +955,23 @@
m->m_pkthdr.len);
ifa_free(&ia6->ia_ifa);
}
+#ifdef RATELIMIT
+ if (inp != NULL) {
+ if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+ in_pcboutput_txrtlmt(inp, ifp, m);
+ /* stamp send tag on mbuf */
+ m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+ } else {
+ m->m_pkthdr.snd_tag = NULL;
+ }
+#endif
error = nd6_output_ifp(ifp, origifp, m, dst,
(struct route *)ro);
+#ifdef RATELIMIT
+ /* check for route change */
+ if (error == EAGAIN)
+ in_pcboutput_eagain(inp);
+#endif
goto done;
}
@@ -1054,8 +1070,23 @@
counter_u64_add(ia->ia_ifa.ifa_obytes,
m->m_pkthdr.len);
}
+#ifdef RATELIMIT
+ if (inp != NULL) {
+ if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+ in_pcboutput_txrtlmt(inp, ifp, m);
+ /* stamp send tag on mbuf */
+ m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+ } else {
+ m->m_pkthdr.snd_tag = NULL;
+ }
+#endif
error = nd6_output_ifp(ifp, origifp, m, dst,
(struct route *)ro);
+#ifdef RATELIMIT
+ /* check for route change */
+ if (error == EAGAIN)
+ in_pcboutput_eagain(inp);
+#endif
} else
m_freem(m);
}
@@ -1441,6 +1472,16 @@
INP_WUNLOCK(in6p);
error = 0;
break;
+ case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+ INP_WLOCK(in6p);
+ in6p->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+ INP_WUNLOCK(in6p);
+ error = 0;
+#else
+ error = EOPNOTSUPP;
+#endif
+ break;
default:
break;
}
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h
+++ sys/sys/mbuf.h
@@ -130,6 +130,14 @@
};
/*
+ * Static network interface owned tag.
+ * Allocated through ifp->if_snd_tag_alloc().
+ * This structure only holds the part common to all send tags, namely
+ * the owning network interface.
+ */
+struct m_snd_tag {
+	struct ifnet *ifp;		/* network interface tag belongs to */
+};
+
+/*
* Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set.
* Size ILP32: 48
* LP64: 56
@@ -137,7 +145,10 @@
* they are correct.
*/
struct pkthdr {
- struct ifnet *rcvif; /* rcv interface */
+ union {
+ struct m_snd_tag *snd_tag; /* send tag, if any */
+ struct ifnet *rcvif; /* rcv interface */
+ };
SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */
int32_t len; /* total packet length */
Index: sys/sys/socket.h
===================================================================
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -159,6 +159,7 @@
#define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */
#define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */
#define SO_TS_CLOCK 0x1017 /* clock type used for SO_TIMESTAMP */
+#define SO_MAX_PACING_RATE 0x1018 /* socket's max TX pacing rate (Linux name) */
#endif
#if __BSD_VISIBLE
Index: sys/sys/socketvar.h
===================================================================
--- sys/sys/socketvar.h
+++ sys/sys/socketvar.h
@@ -128,9 +128,10 @@
uint32_t so_user_cookie;
int so_ts_clock; /* type of the clock used for timestamps */
+ uint32_t so_max_pacing_rate; /* (f) TX rate limit in bytes/s */
- void *so_pspare[2]; /* packet pacing / general use */
- int so_ispare[2]; /* packet pacing / general use */
+ void *so_pspare[2]; /* general use */
+ int so_ispare[2]; /* general use */
};
/*
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Apr 8, 3:28 AM (8 h, 54 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
28297519
Default Alt Text
D3687.1775618921.diff (28 KB)
Attached To
Mode
D3687: Implement kernel support for hardware rate limited sockets
Attached
Detach File
Event Timeline
Log In to Comment