Page MenuHomeFreeBSD

D3687.1778202330.diff
No OneTemporary

Size
17 KB
Referenced Files
None
Subscribers
None

D3687.1778202330.diff

Index: lib/libc/sys/getsockopt.2
===================================================================
--- lib/libc/sys/getsockopt.2
+++ lib/libc/sys/getsockopt.2
@@ -28,7 +28,7 @@
.\" @(#)getsockopt.2 8.4 (Berkeley) 5/2/95
.\" $FreeBSD$
.\"
-.Dd April 5, 2013
+.Dd August 4, 2016
.Dt GETSOCKOPT 2
.Os
.Sh NAME
@@ -187,6 +187,7 @@
.It Dv SO_LISTENQLEN Ta "get complete queue length of the socket (get only)"
.It Dv SO_LISTENINCQLEN Ta "get incomplete queue length of the socket (get only)"
.It Dv SO_USER_COOKIE Ta "set the 'so_user_cookie' value for the socket (uint32_t, set only)"
+.It Dv SO_MAX_PACING_RATE Ta "set the maximum transmit rate in bytes per second for the socket"
.El
.Pp
.Dv SO_DEBUG
@@ -496,6 +497,11 @@
returns the number of unaccepted complete connections.
.Dv SO_LISTENINCQLEN
returns the number of unaccepted incomplete connections.
+.Pp
+.Dv SO_MAX_PACING_RATE
+instructs the socket and underlying network adapter layers that the
+transmit rate should be limited to the given unsigned 32-bit value in
+bytes per second.
.Sh RETURN VALUES
.Rv -std
.Sh ERRORS
Index: sbin/ifconfig/ifconfig.8
===================================================================
--- sbin/ifconfig/ifconfig.8
+++ sbin/ifconfig/ifconfig.8
@@ -28,7 +28,7 @@
.\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94
.\" $FreeBSD$
.\"
-.Dd June 8, 2016
+.Dd August 2, 2016
.Dt IFCONFIG 8
.Os
.Sh NAME
@@ -454,6 +454,8 @@
and 802.11g
.Pq Cm 11g
operating modes.
+.It Cm txrtlmt
+If the driver supports TX rate limiting, enable it on the interface.
.It Cm inst Ar minst , Cm instance Ar minst
Set the media instance to
.Ar minst .
Index: sbin/ifconfig/ifconfig.c
===================================================================
--- sbin/ifconfig/ifconfig.c
+++ sbin/ifconfig/ifconfig.c
@@ -1145,7 +1145,7 @@
"\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
"\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
"\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT"
/*
* Print the status of the interface. If an address family was
@@ -1453,6 +1453,8 @@
DEF_CMD("-wol_mcast", -IFCAP_WOL_MCAST, setifcap),
DEF_CMD("wol_magic", IFCAP_WOL_MAGIC, setifcap),
DEF_CMD("-wol_magic", -IFCAP_WOL_MAGIC, setifcap),
+ DEF_CMD("txrtlmt", IFCAP_TXRTLMT, setifcap),
+ DEF_CMD("-txrtlmt", -IFCAP_TXRTLMT, setifcap),
DEF_CMD("normal", -IFF_LINK0, setifflags),
DEF_CMD("compress", IFF_LINK0, setifflags),
DEF_CMD("noicmp", IFF_LINK1, setifflags),
Index: sys/conf/NOTES
===================================================================
--- sys/conf/NOTES
+++ sys/conf/NOTES
@@ -616,6 +616,8 @@
options INET #Internet communications protocols
options INET6 #IPv6 communications protocols
+options RATELIMIT # TX rate limiting support
+
options ROUTETABLES=2 # allocated fibs up to 65536. default is 1.
# but that would be a bad idea as they are large.
Index: sys/conf/config.mk
===================================================================
--- sys/conf/config.mk
+++ sys/conf/config.mk
@@ -19,6 +19,10 @@
opt_inet6.h:
@echo "#define INET6 1" > ${.TARGET}
.endif
+.if ${MK_RATELIMIT} != "no"
+opt_ratelimit.h:
+ @echo "#define RATELIMIT 1" > ${.TARGET}
+.endif
.if ${MK_EISA} != "no"
opt_eisa.h:
@echo "#define DEV_EISA 1" > ${.TARGET}
Index: sys/conf/kern.opts.mk
===================================================================
--- sys/conf/kern.opts.mk
+++ sys/conf/kern.opts.mk
@@ -47,7 +47,8 @@
EISA \
EXTRA_TCP_STACKS \
NAND \
- OFED
+ OFED \
+ RATELIMIT
# Some options are totally broken on some architectures. We disable
# them. If you need to enable them on an experimental basis, you
Index: sys/conf/options
===================================================================
--- sys/conf/options
+++ sys/conf/options
@@ -408,6 +408,7 @@
BOOTP_WIRED_TO opt_bootp.h
DEVICE_POLLING
DUMMYNET opt_ipdn.h
+RATELIMIT opt_ratelimit.h
INET opt_inet.h
INET6 opt_inet6.h
IPDIVERT
Index: sys/kern/uipc_socket.c
===================================================================
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -105,6 +105,7 @@
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_ratelimit.h"
#include "opt_compat.h"
#include <sys/param.h>
@@ -2678,6 +2679,18 @@
#endif
break;
+ case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+ error = sooptcopyin(sopt, &val32, sizeof(val32),
+ sizeof(val32));
+ if (error)
+ goto bad;
+ so->so_max_pacing_rate = val32;
+#else
+ error = EOPNOTSUPP;
+#endif
+ break;
+
default:
if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
error = hhook_run_socket(so, sopt,
@@ -2733,6 +2746,9 @@
#ifdef MAC
struct mac extmac;
#endif
+#ifdef RATELIMIT
+ uint32_t val32;
+#endif
CURVNET_SET(so->so_vnet);
error = 0;
@@ -2865,6 +2881,15 @@
optval = so->so_incqlen;
goto integer;
+ case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+ val32 = so->so_max_pacing_rate;
+ error = sooptcopyout(sopt, &val32, sizeof(val32));
+#else
+ error = EOPNOTSUPP;
+#endif
+ break;
+
default:
if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
error = hhook_run_socket(so, sopt,
Index: sys/net/if.h
===================================================================
--- sys/net/if.h
+++ sys/net/if.h
@@ -239,6 +239,7 @@
#define IFCAP_RXCSUM_IPV6 0x200000 /* can offload checksum on IPv6 RX */
#define IFCAP_TXCSUM_IPV6 0x400000 /* can offload checksum on IPv6 TX */
#define IFCAP_HWSTATS 0x800000 /* manages counters internally */
+#define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */
#define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
@@ -371,6 +372,16 @@
};
/*
+ * Interface to create/delete/modify/get TX rate limiting.
+ */
+struct ifreq_txrtlmt {
+ uint32_t txringid_max_rate;
+ uint32_t txringid;
+ uint32_t txringid_flowid;
+ uint32_t txringid_flowtype;
+};
+
+/*
* Interface request structure used for socket
* ioctl's. All interface ioctl's must have parameter
* definitions which begin with ifr_name. The
Index: sys/net/if.c
===================================================================
--- sys/net/if.c
+++ sys/net/if.c
@@ -2767,6 +2767,17 @@
ifr = (struct ifreq *)data;
switch (cmd) {
+ /*
+ * The TX rate limiting IOCTLs should only be used
+ * within the kernel. Prevent user-space from using
+ * them:
+ */
+ case SIOCARATECTL:
+ case SIOCSRATECTL:
+ case SIOCDRATECTL:
+ CURVNET_RESTORE();
+ return (EOPNOTSUPP);
+
#ifdef VIMAGE
case SIOCSIFRVNET:
error = priv_check(td, PRIV_NET_SETIFVNET);
Index: sys/netinet/in_pcb.h
===================================================================
--- sys/netinet/in_pcb.h
+++ sys/netinet/in_pcb.h
@@ -202,10 +202,13 @@
u_char inp_ip_minttl; /* (i) minimum TTL or drop */
uint32_t inp_flowid; /* (x) flow id / queue id */
u_int inp_refcount; /* (i) refcount */
- void *inp_pspare[5]; /* (x) packet pacing / general use */
+ struct ifnet *inp_txringid_ifp; /* (i) ifp of TX ring ID */
+ void *inp_pspare[4]; /* (x) packet pacing / general use */
uint32_t inp_flowtype; /* (x) M_HASHTYPE value */
uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */
- u_int inp_ispare[4]; /* (x) packet pacing / user cookie /
+ uint32_t inp_txringid_max_rate; /* (i) driver TX ring ID rate */
+ uint32_t inp_txringid; /* (i) driver TX ring ID */
+ u_int inp_ispare[2]; /* (x) packet pacing / user cookie /
* general use */
/* Local and foreign ports, local and foreign addr. */
@@ -736,6 +739,9 @@
struct sockaddr *
in_sockaddr(in_port_t port, struct in_addr *addr);
void in_pcbsosetlabel(struct socket *so);
+#ifdef RATELIMIT
+void in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
+#endif
#endif /* _KERNEL */
#endif /* !_NETINET_IN_PCB_H_ */
Index: sys/netinet/in_pcb.c
===================================================================
--- sys/netinet/in_pcb.c
+++ sys/netinet/in_pcb.c
@@ -42,6 +42,7 @@
#include "opt_ipsec.h"
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_ratelimit.h"
#include "opt_pcbgroup.h"
#include "opt_rss.h"
@@ -57,6 +58,7 @@
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
+#include <sys/sockio.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
@@ -136,6 +138,9 @@
#define V_ipport_tcplastcount VNET(ipport_tcplastcount)
static void in_pcbremlists(struct inpcb *inp);
+#ifdef RATELIMIT
+static void in_pcbdetach_txrtlmt(struct inpcb *inp);
+#endif
#ifdef INET
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
struct in_addr faddr, u_int fport_arg,
@@ -1140,6 +1145,10 @@
KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
+#ifdef RATELIMIT
+ if (inp->inp_txringid_ifp != NULL)
+ in_pcbdetach_txrtlmt(inp);
+#endif
inp->inp_socket->so_pcb = NULL;
inp->inp_socket = NULL;
}
@@ -2683,3 +2692,193 @@
db_print_inpcb(inp, "inpcb", 0);
}
#endif /* DDB */
+
+#ifdef RATELIMIT
+/*
+ * Modify existing TX rate limit on inp_txringid_ifp and update
+ * inpcb info:
+ */
+static int
+in_pcbmodify_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
+ uint32_t max_pacing_rate)
+{
+ struct ifreq_txrtlmt req;
+ int error;
+
+ INP_WLOCK_ASSERT(inp);
+
+ req.txringid_max_rate = max_pacing_rate;
+ req.txringid = inp->inp_txringid;
+ req.txringid_flowid = inp->inp_flowid;
+ req.txringid_flowtype = inp->inp_flowtype;
+
+ error = ifp->if_ioctl(ifp, SIOCSRATECTL, (caddr_t)&req);
+
+ if (error)
+ return (error);
+
+ inp->inp_txringid_max_rate = max_pacing_rate;
+ return (0);
+}
+
+/*
+ * Create a TX rate limit on ifp and attach it to inpcb:
+ */
+static int
+in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
+ uint32_t max_pacing_rate)
+{
+ struct ifreq_txrtlmt req;
+ int error;
+
+ INP_WLOCK_ASSERT(inp);
+ KASSERT(inp->inp_txringid_ifp == NULL,
+ ("%s: inp_txringid_ifp != NULL", __func__));
+
+ req.txringid_max_rate = max_pacing_rate;
+ req.txringid_flowid = inp->inp_flowid;
+ req.txringid_flowtype = inp->inp_flowtype;
+
+ if_ref(ifp);
+ error = ifp->if_ioctl(ifp, SIOCARATECTL, (caddr_t)&req);
+
+ if (error) {
+ if_rele(ifp);
+ return (error);
+ }
+
+ inp->inp_txringid_ifp = ifp;
+ inp->inp_txringid_max_rate = max_pacing_rate;
+ inp->inp_txringid = req.txringid;
+ return (0);
+}
+
+/*
+ * Remove TX rate limit from inp_txringid_ifp and detach it from
+ * the inpcb:
+ */
+static void
+in_pcbdetach_txrtlmt(struct inpcb *inp)
+{
+ struct ifreq_txrtlmt req;
+ struct ifnet *ifp;
+
+ INP_WLOCK_ASSERT(inp);
+
+ KASSERT(inp->inp_txringid_ifp != NULL,
+ ("%s: inp->inp_txringid_ifp == NULL", __func__));
+
+ ifp = inp->inp_txringid_ifp;
+ req.txringid = inp->inp_txringid;
+ req.txringid_flowid = inp->inp_flowid;
+ req.txringid_flowtype = inp->inp_flowtype;
+
+ inp->inp_txringid_ifp = NULL;
+ inp->inp_txringid = 0;
+ inp->inp_txringid_max_rate = 0;
+
+ /*
+ * If the device was detached while we still had reference on
+ * ifp, we assume if_dead() was called and replaced callbacks
+ * with stubs.
+ */
+ ifp->if_ioctl(ifp, SIOCDRATECTL, (caddr_t)&req);
+ if_rele(ifp);
+}
+
+/*
+ * Track route changes and modify the TX rate limit hint in the given
+ * mbuf to match what the network driver expects.
+ */
+void
+in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+{
+ struct socket *socket;
+ uint32_t max_pacing_rate;
+ int error;
+
+ if (inp == NULL)
+ return;
+
+ socket = inp->inp_socket;
+ if (socket == NULL)
+ return;
+
+ /*
+ * NOTE: The so_max_pacing_rate value is read unlocked,
+ * because atomic updates are not required since the variable
+ * is checked at every mbuf we send. It is assumed that the
+ * variable read itself will be atomic.
+ */
+ max_pacing_rate = socket->so_max_pacing_rate;
+
+ if (max_pacing_rate == 0 && inp->inp_txringid_ifp == NULL)
+ return;
+
+ /*
+ * In order to utilize packet pacing with RSS, we need to wait
+ * until there is a valid RSS hash before we can proceed:
+ */
+ if (inp->inp_flowtype == M_HASHTYPE_NONE)
+ return;
+
+ /*
+ * NOTE: If we have a reference on the ifp, a new ifp can't be
+ * created at the same memory address of the old ifp. This
+ * lets us ensure that if we transmit on one interface and its
+ * module is unloaded and then loaded, we won't try to
+ * transmit on an invalid ring on the new ifp, but first we
+ * delete the ring on the old ifp, and then will create a new
+ * one on the new ifp.
+ */
+ if (ifp != inp->inp_txringid_ifp) {
+ bool wlocked = INP_WLOCKED(inp);
+
+ if (!wlocked) {
+ /*
+ * NOTE: If the write locking fails, we need
+ * to bail out and use the non-ratelimited
+ * ring for the transmit until there is a new
+ * chance to write lock.
+ */
+ if (!INP_TRY_UPGRADE(inp))
+ return;
+ }
+
+ if (inp->inp_txringid_ifp != NULL)
+ in_pcbdetach_txrtlmt(inp);
+
+ error = in_pcbattach_txrtlmt(inp, ifp, max_pacing_rate);
+
+ if (!wlocked)
+ INP_DOWNGRADE(inp);
+ if (error)
+ return;
+
+ } else if (inp->inp_txringid_max_rate != max_pacing_rate) {
+ bool wlocked = INP_WLOCKED(inp);
+
+ if (!wlocked) {
+ /*
+ * NOTE: If the write locking fails, use the
+ * current pacing rate until there is a new
+ * chance to write lock:
+ */
+ if (!INP_TRY_UPGRADE(inp))
+ goto done;
+ }
+
+ error = in_pcbmodify_txrtlmt(inp, ifp, max_pacing_rate);
+ if (!wlocked)
+ INP_DOWNGRADE(inp);
+ if (error)
+ goto done; /* use old rate */
+ }
+done:
+ /*
+ * Update the flow ID and RSS hash for the transmitted mbuf.
+ */
+ mb->m_pkthdr.flowid = inp->inp_txringid;
+ M_HASHTYPE_SET(mb, M_HASHTYPE_TXRTLMT);
+}
+#endif /* RATELIMIT */
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -33,6 +33,7 @@
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
+#include "opt_ratelimit.h"
#include "opt_ipsec.h"
#include "opt_mbuf_stress_test.h"
#include "opt_mpath.h"
@@ -657,7 +658,12 @@
* to avoid confusing lower layers.
*/
m_clrprotoflags(m);
+
IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
+#ifdef RATELIMIT
+ if (ifp->if_capabilities & IFCAP_TXRTLMT)
+ in_pcboutput_txrtlmt(inp, ifp, m);
+#endif
error = (*ifp->if_output)(ifp, m,
(const struct sockaddr *)gw, ro);
goto done;
@@ -694,6 +700,10 @@
m_clrprotoflags(m);
IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
+#ifdef RATELIMIT
+ if (ifp->if_capabilities & IFCAP_TXRTLMT)
+ in_pcboutput_txrtlmt(inp, ifp, m);
+#endif
error = (*ifp->if_output)(ifp, m,
(const struct sockaddr *)gw, ro);
} else
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -65,6 +65,7 @@
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_ratelimit.h"
#include "opt_ipsec.h"
#include "opt_sctp.h"
#include "opt_route.h"
@@ -944,6 +945,10 @@
m->m_pkthdr.len);
ifa_free(&ia6->ia_ifa);
}
+#ifdef RATELIMIT
+ if (ifp->if_capabilities & IFCAP_TXRTLMT)
+ in_pcboutput_txrtlmt(inp, ifp, m);
+#endif
error = nd6_output_ifp(ifp, origifp, m, dst,
(struct route *)ro);
goto done;
@@ -1044,6 +1049,10 @@
counter_u64_add(ia->ia_ifa.ifa_obytes,
m->m_pkthdr.len);
}
+#ifdef RATELIMIT
+ if (ifp->if_capabilities & IFCAP_TXRTLMT)
+ in_pcboutput_txrtlmt(inp, ifp, m);
+#endif
error = nd6_output_ifp(ifp, origifp, m, dst,
(struct route *)ro);
} else
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h
+++ sys/sys/mbuf.h
@@ -344,6 +344,7 @@
#define M_HASHTYPE_RSS_UDP_IPV6_EX M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple +
* ext hdrs */
+#define M_HASHTYPE_TXRTLMT 62 /* rate limited TX traffic */
#define M_HASHTYPE_OPAQUE 63 /* ordering, not affinity */
#define M_HASHTYPE_OPAQUE_HASH M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE)
/* ordering+hash, not affinity*/
Index: sys/sys/socket.h
===================================================================
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -158,6 +158,7 @@
#define SO_USER_COOKIE 0x1015 /* user cookie (dummynet etc.) */
#define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */
#define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */
+#define SO_MAX_PACING_RATE 0x1017 /* set max TX pacing rate per socket */
#endif
/*
Index: sys/sys/socketvar.h
===================================================================
--- sys/sys/socketvar.h
+++ sys/sys/socketvar.h
@@ -79,6 +79,7 @@
void *so_pcb; /* protocol control block */
struct vnet *so_vnet; /* (a) network stack instance */
struct protosw *so_proto; /* (a) protocol handle */
+ uint32_t so_max_pacing_rate; /* (f) TX pacing rate info */
/*
* Variables for connection queuing.
* Socket where accepts occur is so_head in all subsidiary sockets.
Index: sys/sys/sockio.h
===================================================================
--- sys/sys/sockio.h
+++ sys/sys/sockio.h
@@ -133,4 +133,8 @@
#define SIOCGIFGMEMB _IOWR('i', 138, struct ifgroupreq) /* get members */
#define SIOCGIFXMEDIA _IOWR('i', 139, struct ifmediareq) /* get net xmedia */
+#define SIOCARATECTL _IOWR('i', 140, struct ifreq_txrtlmt) /* add tx rate limit */
+#define SIOCSRATECTL _IOWR('i', 141, struct ifreq_txrtlmt) /* set tx rate limit */
+#define SIOCDRATECTL _IOW('i', 142, struct ifreq_txrtlmt) /* del tx rate limit */
+
#endif /* !_SYS_SOCKIO_H_ */

File Metadata

Mime Type
text/plain
Expires
Fri, May 8, 1:05 AM (18 h, 6 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
28584075
Default Alt Text
D3687.1778202330.diff (17 KB)

Event Timeline