Index: sys/modules/mlx4/Makefile =================================================================== --- sys/modules/mlx4/Makefile +++ sys/modules/mlx4/Makefile @@ -13,6 +13,11 @@ CFLAGS+= -I${.CURDIR}/../../ofed/drivers/net/mlx4 CFLAGS+= -I${.CURDIR}/../../ofed/include/ +.if defined(CONFIG_RATELIMIT) +CFLAGS += -DCONFIG_RATELIMIT=1 +CFLAGS += -DCONFIG_WQE_FORMAT_1=1 +.endif + .include CFLAGS+= -Wno-cast-qual -Wno-pointer-arith Index: sys/modules/mlxen/Makefile =================================================================== --- sys/modules/mlxen/Makefile +++ sys/modules/mlxen/Makefile @@ -9,6 +9,11 @@ CFLAGS+= -I${.CURDIR}/../../ofed/drivers/net/mlx4 CFLAGS+= -I${.CURDIR}/../../ofed/include/ +.if defined(CONFIG_RATELIMIT) +CFLAGS += -DCONFIG_RATELIMIT=1 +CFLAGS += -DCONFIG_WQE_FORMAT_1=1 +.endif + .include CFLAGS+= -Wno-cast-qual -Wno-pointer-arith Index: sys/ofed/drivers/net/mlx4/en_main.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_main.c +++ sys/ofed/drivers/net/mlx4/en_main.c @@ -48,6 +48,9 @@ #ifdef __linux__ MODULE_VERSION(DRV_VERSION " ("DRV_RELDATE")"); #endif +#ifdef CONFIG_RATELIMIT +#include +#endif static const char mlx4_en_version[] = DRV_NAME ": Mellanox ConnectX HCA Ethernet driver v" @@ -76,6 +79,16 @@ #define MAX_PFC_TX 0xff #define MAX_PFC_RX 0xff +#ifdef CONFIG_RATELIMIT +/* User can use up to MLX4_NUM_PRIORITIES number of priorities */ +#define PRIOS_BUFF_SIZE (MLX4_NUM_PRIORITIES * 3) +static char prios_for_rl[PRIOS_BUFF_SIZE] = {0}; +TUNABLE_STR("hw.mlx4_en.config_prios_for_rl_rings", prios_for_rl, sizeof(prios_for_rl)); + +/* Sysctl rate limit ring info will be shown if this variable was set */ +int show_rl_sysctl_info = 0; +TUNABLE_INT("rate.limit.debug", &show_rl_sysctl_info); +#endif static int mlx4_en_get_profile(struct mlx4_en_dev *mdev) { @@ -226,6 +239,15 @@ goto err_mr; } +#ifdef CONFIG_RATELIMIT + if (dev->caps.rl_caps.enable) { + mdev->num_rl_prios = 
mlx4_parse_prios_for_rl(prios_for_rl, &mdev->lst_of_prios, MLX4_NUM_PRIORITIES); + for (i = 0; i < MLX4_NUM_PRIORITIES; i++) + if ((mdev->lst_of_prios & (1 << i)) != 0) + mlx4_info(mdev, "Rate limit supports priority: %d\n", i); + } +#endif + /* Configure which ports to start according to module parameters */ mdev->port_cnt = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) Index: sys/ofed/drivers/net/mlx4/en_netdev.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_netdev.c +++ sys/ofed/drivers/net/mlx4/en_netdev.c @@ -49,6 +49,11 @@ #include #include +#ifdef CONFIG_RATELIMIT +#include +#include +#endif + #include "mlx4_en.h" #include "en_port.h" @@ -1059,7 +1064,11 @@ priv->last_moder_bytes[i] = 0; } +#ifdef CONFIG_RATELIMIT + for (i = 0; i < priv->native_tx_ring_num; i++) { +#else for (i = 0; i < priv->tx_ring_num; i++) { +#endif cq = priv->tx_cq[i]; cq->moder_cnt = priv->tx_frames; cq->moder_time = priv->tx_usecs; @@ -1305,6 +1314,10 @@ /* Configure tx cq's and rings */ for (i = 0; i < priv->tx_ring_num; i++) { +#ifdef CONFIG_RATELIMIT + if (!TX_RING_USER_VALID(i)) + continue; +#endif /* Configure cq */ cq = priv->tx_cq[i]; err = mlx4_en_activate_cq(priv, cq, i); @@ -1329,6 +1342,15 @@ if (err) { en_err(priv, "Failed activating Tx ring %d\n", i); mlx4_en_deactivate_cq(priv, cq); +#ifdef CONFIG_RATELIMIT + if (i >= priv->native_tx_ring_num) { + /* Rate limit ring - no need for err flow*/ + mlx4_en_invalidate_rl_ring(priv, i); + mlx4_en_rl_reused_index_insert(priv, i); + ++tx_index; + continue; + } +#endif goto tx_err; } @@ -1494,13 +1516,22 @@ /* Free TX Rings */ for (i = 0; i < priv->tx_ring_num; i++) { +#ifdef CONFIG_RATELIMIT + if (!TX_RING_USER_VALID(i)) + continue; +#endif mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[i]); mlx4_en_deactivate_cq(priv, priv->tx_cq[i]); } msleep(10); - for (i = 0; i < priv->tx_ring_num; i++) + for (i = 0; i < priv->tx_ring_num; i++){ +#ifdef CONFIG_RATELIMIT + if 
(!TX_RING_USER_VALID(i)) + continue; +#endif mlx4_en_free_tx_buf(dev, priv->tx_ring[i]); + } /* Free RSS qps */ mlx4_en_release_rss_steer(priv); @@ -1534,6 +1565,10 @@ if (priv->blocked == 0 || priv->port_up == 0) return; for (i = 0; i < priv->tx_ring_num; i++) { +#ifdef CONFIG_RATELIMIT + if (!TX_RING_USER_VALID(i)) + continue; +#endif ring = priv->tx_ring[i]; if (ring->blocked && ring->watchdog_time + MLX4_EN_WATCHDOG_TIMEOUT < ticks) @@ -1572,6 +1607,10 @@ memset(&priv->vport_stats, 0, sizeof(priv->vport_stats)); for (i = 0; i < priv->tx_ring_num; i++) { +#ifdef CONFIG_RATELIMIT + if (!TX_RING_USER_VALID(i)) + continue; +#endif priv->tx_ring[i]->bytes = 0; priv->tx_ring[i]->packets = 0; priv->tx_ring[i]->tx_csum = 0; @@ -1627,7 +1666,11 @@ } #endif +#ifdef CONFIG_RATELIMIT + for (i = 0; i < priv->native_tx_ring_num; i++) { +#else for (i = 0; i < priv->tx_ring_num; i++) { +#endif if (priv->tx_ring && priv->tx_ring[i]) mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]); if (priv->tx_cq && priv->tx_cq[i]) @@ -1645,8 +1688,27 @@ if (priv->sysctl) sysctl_ctx_free(&priv->stat_ctx); +} + +#ifdef CONFIG_RATELIMIT +static void mlx4_en_free_rl_resources(struct mlx4_en_priv *priv) +{ + int i; + struct mlx4_en_tx_ring *ring; + for (i = priv->native_tx_ring_num; i < priv->tx_ring_num; i++) { + if (priv->tx_ring && priv->tx_ring[i]) { + ring = priv->tx_ring[i]; + if (ring->rl_data.user_valid == true) { + sysctl_ctx_free(&ring->rl_data.rl_stats_ctx); + } + mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]); + } + if (priv->tx_cq && priv->tx_cq[i]) + mlx4_en_destroy_cq(priv, &priv->tx_cq[i]); + } } +#endif int mlx4_en_alloc_resources(struct mlx4_en_priv *priv) { @@ -1666,7 +1728,11 @@ } /* Create tx Rings */ +#ifdef CONFIG_RATELIMIT + for (i = 0; i < priv->native_tx_ring_num; i++) { +#else for (i = 0; i < priv->tx_ring_num; i++) { +#endif if (mlx4_en_create_cq(priv, &priv->tx_cq[i], prof->tx_ring_size, i, TX, node)) goto err; @@ -1695,7 +1761,11 @@ if (priv->rx_cq[i]) 
mlx4_en_destroy_cq(priv, &priv->rx_cq[i]); } +#ifdef CONFIG_RATELIMIT + for (i = 0; i < priv->native_tx_ring_num; i++) { +#else for (i = 0; i < priv->tx_ring_num; i++) { +#endif if (priv->tx_ring[i]) mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]); if (priv->tx_cq[i]) @@ -1736,6 +1806,16 @@ if (priv->allocated) mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE); +#ifdef CONFIG_RATELIMIT + /* Need to make sure there are no rl related tasks(1) + * and more won't be added before we run mlx4_en_stop_port(2). + * 1 - We get because we drain and free the tq below. + * 2 - We get becase we called ether_ifdetach which replaces + * the ioctl callback, so new tasks won't be added. + */ + taskqueue_drain(priv->rl_tq, &priv->rl_task); + taskqueue_free(priv->rl_tq); +#endif mutex_lock(&mdev->state_lock); mlx4_en_stop_port(dev); mutex_unlock(&mdev->state_lock); @@ -1752,6 +1832,9 @@ mdev->pndev[priv->port] = NULL; mutex_unlock(&mdev->state_lock); +#ifdef CONFIG_RATELIMIT + mlx4_en_free_rl_resources(priv); +#endif mlx4_en_free_resources(priv); @@ -1759,6 +1842,9 @@ if (priv->sysctl) sysctl_ctx_free(&priv->conf_ctx); +#ifdef CONFIG_RATELIMIT + kfree(priv->rate_limits); +#endif kfree(priv->tx_ring); kfree(priv->tx_cq); @@ -1906,6 +1992,9 @@ struct mlx4_en_priv *priv; struct mlx4_en_dev *mdev; struct ifreq *ifr; +#ifdef CONFIG_RATELIMIT + struct ifreq_txrtlmt *rl_req; +#endif int error; int mask; @@ -1967,6 +2056,20 @@ mutex_unlock(&mdev->state_lock); VLAN_CAPABILITIES(dev); break; +#ifdef CONFIG_RATELIMIT + case SIOCARATECTL: + rl_req = (struct ifreq_txrtlmt *)data; + error = mlx4_en_create_rate_limit_ring(priv, rl_req); + break; + case SIOCSRATECTL: + rl_req = (struct ifreq_txrtlmt *)data; + error = mlx4_en_modify_rate_limit_ring(priv, rl_req); + break; + case SIOCDRATECTL: + rl_req = (struct ifreq_txrtlmt *)data; + error = mlx4_en_destroy_rate_limit_ring(priv, rl_req); + break; +#endif case SIOCGI2C: { struct ifi2creq i2c; @@ -2007,6 +2110,9 @@ uint8_t 
dev_addr[ETHER_ADDR_LEN]; int err; int i; +#ifdef CONFIG_RATELIMIT + struct mlx4_num_of_rates all_num_rates; +#endif priv = kzalloc(sizeof(*priv), GFP_KERNEL); dev = priv->dev = if_alloc(IFT_ETHER); @@ -2030,6 +2136,9 @@ */ priv->counter_index = 0xff; spin_lock_init(&priv->stats_lock); +#ifdef CONFIG_RATELIMIT + mutex_init(&priv->rate_limit_table_lock); +#endif INIT_WORK(&priv->rx_mode_task, mlx4_en_do_set_rx_mode); INIT_WORK(&priv->watchdog_task, mlx4_en_restart); INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate); @@ -2054,6 +2163,15 @@ priv->num_tx_rings_p_up = mdev->profile.num_tx_rings_p_up; priv->tx_ring_num = prof->tx_ring_num; + +#ifdef CONFIG_RATELIMIT + /* Save number of non RL tx rings */ + priv->native_tx_ring_num = priv->tx_ring_num; + priv->rate_limit_tx_ring_num = 0; + priv->next_free_rl_index = 1; + priv->new_rate_for_avail_index = 0; +#endif + priv->tx_ring = kcalloc(MAX_TX_RINGS, sizeof(struct mlx4_en_tx_ring *), GFP_KERNEL); if (!priv->tx_ring) { @@ -2089,6 +2207,53 @@ for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) INIT_HLIST_HEAD(&priv->mac_hash[i]); +#ifdef CONFIG_RATELIMIT + if (mdev->dev->caps.rl_caps.enable) { + memset(&all_num_rates, 0, sizeof(all_num_rates)); + /* Query total number of rates */ + err = mlx4_query_rl_fw_resources(mdev->dev, port, + &all_num_rates.available_RPP); + if (!err) { + /* Set number of rates per prioroty */ + if (mdev->num_rl_prios) + priv->num_rates_per_prio = + all_num_rates.available_RPP/mdev->num_rl_prios; + /* + * Adding one to priv->num_rates_per_prio because index zero + * is used for the regular SQs, + * Therefore FW can recieve 1-120 new different rates + */ + priv->rate_limits = (struct mlx4_en_rate_limit_indexes *) kzalloc((priv->num_rates_per_prio + 1) + * sizeof(struct mlx4_en_rate_limit_indexes), GFP_KERNEL); + for (i = 0; i < MLX4_NUM_PRIORITIES; i++) { + if (mdev->lst_of_prios & (1 << i)) { + all_num_rates.RPP_per_prio[i] = + priv->num_rates_per_prio; + } + } + /* + * Write base QP to the fw, fw will 
compute the amount of needed + * QPs starting from this base QP + */ + all_num_rates.base_qp_num = mdev->dev->caps.fw_reserved_qp_base; + err = mlx4_alloc_rl_fw_resources(mdev->dev, port, &all_num_rates); + if (err) { + en_err(priv, "Couldn't set available number of rates per prio for port %d\n", port); + en_err(priv, "Couldn't set the amount of reserved qps needed by the FW \n"); + mdev->dev->caps.rl_caps.enable = 0; + } + /* + * Update the base QP upon fw update success, now the new base QP = + * the previous base QP + number of priorities, because fw reserves + * 1 qp per each RLPP + */ + mdev->dev->caps.fw_reserved_qp_base += all_num_rates.available_RPP; + } else { + en_err(priv, "Couldn't read available number of rates for port %d\n", port); + mdev->dev->caps.rl_caps.enable = 0; + } + } +#endif /* Query for default mac and max mtu */ priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port]; @@ -2116,6 +2281,25 @@ if (err) goto out; +#ifdef CONFIG_RATELIMIT + spin_lock_init(&priv->tx_ring_index_lock); + spin_lock_init(&priv->rl_op_lock); + STAILQ_INIT(&priv->reuse_index_list_head); + STAILQ_INIT(&priv->rl_op_list_head); + priv->rl_tq = taskqueue_create_fast("mlx4_en_rl_operation", M_NOWAIT, + taskqueue_thread_enqueue, &priv->rl_tq); + TASK_INIT(&priv->rl_task, 0, mlx4_en_async_rl_operation, priv); + taskqueue_start_threads(&priv->rl_tq, 1, PI_NET, "%s priv rl task", + if_name(priv->dev)); + + for (i = priv->native_tx_ring_num; i < MAX_TX_RINGS; i++) { + struct mlx4_en_reuse_index_list_element *reused_item; + reused_item = priv->reuse_index_list_array + i; + reused_item->val = i; + } + +#endif + /* Allocate page for receive rings */ err = mlx4_alloc_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE, MLX4_EN_PAGE_SIZE); @@ -2133,6 +2317,10 @@ dev->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER; dev->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU; dev->if_capabilities |= IFCAP_LRO; +#ifdef CONFIG_RATELIMIT + if 
(priv->mdev->dev->caps.rl_caps.enable) + dev->if_capabilities |= IFCAP_TXRTLMT; +#endif if (mdev->LSO_support) dev->if_capabilities |= IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTSO; @@ -2293,6 +2481,292 @@ return (error); } +#ifdef CONFIG_RATELIMIT +#define RATE 0x0 +#define BURST_SIZE 0x1 +#define RL_RINGS_BUFFER_LENGTH (25 * MLX4_EN_DEF_MAX_RL_TX_RINGS) + +/* Always call this function with a rate_limit_table_lock */ +static int mlx4_en_rl_locked_set(struct mlx4_en_priv *priv, u8 index, + u32 rate, u8 burst_size, int flag) +{ + int err = 0; + int i; + struct mlx4_qp_rl_index qp_rl_index; + u32 old_rate = priv->rate_limits[index].rate; + u8 old_burst = priv->rate_limits[index].burst_size; + + switch (flag) { + case RATE: + if (rate > priv->mdev->dev->caps.rl_caps.calc_max_val || + (rate < priv->mdev->dev->caps.rl_caps.calc_min_val && + rate != 0)) + return EINVAL; + priv->rate_limits[index].rate = rate; + break; + case BURST_SIZE: + priv->rate_limits[index].burst_size = burst_size; + /* If rate was not determined yet, not writing to FW */ + if (!priv->rate_limits[index].rate) + return err; + break; + default: + return EINVAL; + } + + memset(&qp_rl_index, 0, sizeof(qp_rl_index)); + for (i = 0; i < MLX4_NUM_PRIORITIES; i++) { + if (priv->mdev->lst_of_prios & (1 << i)) { + /* FW expects to receive rates in Kb/sec */ + qp_rl_index.rates[i] = ((priv->rate_limits[index].rate) / 1000); + qp_rl_index.burst_size[i] = priv->rate_limits[index].burst_size; + } + } + err = mlx4_set_rates_and_burst_size(priv->mdev->dev, priv->port, index, + &qp_rl_index); + if (err) { + priv->rate_limits[index].rate = old_rate; + priv->rate_limits[index].burst_size = old_burst; + en_err(priv, "Couldn't set port %d with new rate/burst size\n", + priv->port); + } + return err; +} + +static int mlx4_en_set_burst_for_index(SYSCTL_HANDLER_ARGS) +{ + struct mlx4_en_priv *priv = arg1; + int error; + u8 index = arg2; + u8 burst_size = 0; + char burst_buf[15] = "burst_low"; + + 
mutex_lock(&priv->rate_limit_table_lock); + + if (priv->rate_limits[index].burst_size) + strlcpy(burst_buf, "burst_high", sizeof(burst_buf)); + error = sysctl_handle_string(oidp, burst_buf, sizeof(burst_buf), req); + if (error != 0 || req->newptr == NULL) { + mutex_unlock(&priv->rate_limit_table_lock); + return (error); + } + if (strcmp(burst_buf, "burst_high") == 0) + burst_size = 1; + else if (strcmp(burst_buf, "burst_low") == 0) + burst_size = 0; + else { + mutex_unlock(&priv->rate_limit_table_lock); + en_err(priv, "Invalid value, value should be burst_high/burst_low\n"); + return EINVAL; + } + if (burst_size != priv->rate_limits[index].burst_size) + error = mlx4_en_rl_locked_set(priv, index, 0, burst_size, BURST_SIZE); + if (error) + en_err(priv, "Couldn't set burst size %u, for port %d\n", + burst_size, priv->port); + + mutex_unlock(&priv->rate_limit_table_lock); + return error; +} + +static int mlx4_en_set_rate_for_index(SYSCTL_HANDLER_ARGS) +{ + struct mlx4_en_priv *priv = arg1; + u8 index = arg2; + u32 rate; + int i; + int error; + u8 valid = 0; + + mutex_lock(&priv->rate_limit_table_lock); + rate = priv->rate_limits[index].rate; + if (!rate) { + valid = 1; + } + error = sysctl_handle_int(oidp, &rate, 0, req); + if (error || !req->newptr) { + mutex_unlock(&priv->rate_limit_table_lock); + return error; + } + for (i = 0; i <= priv->num_rates_per_prio; i++) { + if (priv->rate_limits[i].rate == rate) { + en_err(priv, "Rate already exists in index %d\n", i); + mutex_unlock(&priv->rate_limit_table_lock); + return error; + } + } + if (!valid) { + en_err(priv, "Rate for index %u already exists\n", index); + mutex_unlock(&priv->rate_limit_table_lock); + return error; + } + error = mlx4_en_rl_locked_set(priv, index, rate, 0, RATE); + if (error) + en_err(priv, "Couldn't set rate %u, at index %u for port %d\n",rate, + index, priv->port); + + mutex_unlock(&priv->rate_limit_table_lock); + + return error; +} + +/* This sysctl mib is not shown but only written to. 
that is why + * SYSCTL_IN is being used first - in order to read the user's data, + * and SYSCTL_OUT is at the end in order to show the user's update. */ +static int mlx4_en_set_rate_on_first_available_index(SYSCTL_HANDLER_ARGS) +{ + struct mlx4_en_priv *priv = arg1; + int error = 0; + int i; + + mutex_lock(&priv->rate_limit_table_lock); + + while ((priv->next_free_rl_index <= priv->num_rates_per_prio) && + priv->rate_limits[priv->next_free_rl_index].rate) + priv->next_free_rl_index++; + + if (req->newptr != NULL) { + SYSCTL_IN(req, &priv->new_rate_for_avail_index, sizeof(int)); + if (priv->next_free_rl_index > priv->num_rates_per_prio) { + en_err(priv, "No space left for new rates\n"); + priv->new_rate_for_avail_index = 0; + mutex_unlock(&priv->rate_limit_table_lock); + return ENOSPC; + } + for (i = 0; i <= priv->num_rates_per_prio; i++) { + if (priv->rate_limits[i].rate == priv->new_rate_for_avail_index) { + en_err(priv, "Rate already exists in index %d\n", i); + priv->new_rate_for_avail_index = 0; + mutex_unlock(&priv->rate_limit_table_lock); + return error; + } + } + error = mlx4_en_rl_locked_set(priv, priv->next_free_rl_index, + priv->new_rate_for_avail_index, 0, RATE); + if (error) { + en_err(priv, "Couldn't set rate %u, for port %d\n", + priv->new_rate_for_avail_index, priv->port); + priv->new_rate_for_avail_index = 0; + } + else { + priv->next_free_rl_index++; + } + } + + SYSCTL_OUT(req, &priv->new_rate_for_avail_index, sizeof(int)); + if (req->oldptr != NULL) + priv->new_rate_for_avail_index = 0; + mutex_unlock(&priv->rate_limit_table_lock); + return error; +} + +static void add_commas(char *str, u32 rate) +{ + char buffer_old[30]; + char buffer_tmp[30]; + int i, len; + int j = 0; + int cnt = 0; + + sprintf(buffer_old, "%u", rate); + len = strlen(buffer_old); + for (i=len-1; i>=0; i--){ + cnt++; + j++; + if (((cnt%3)==0) && (i!=0)) { + strcpy(&buffer_tmp[j-1],&buffer_old[i]); + strcpy(&buffer_tmp[j], ","); + j++; + } else { + strcpy(&buffer_tmp[j-1], 
&buffer_old[i]); + } + } + buffer_tmp[j] = '\0'; + len = strlen(buffer_tmp); + for (i=0; inum_rates_per_prio, req); + sbuf_printf(&sbuf,"\n\n INDEX CURRENTLY USED BURST RATE [bit/s]\n" + " ----------------------------------------------------\n"); + mutex_lock(&priv->rate_limit_table_lock); + for (i = 1; i <= priv->num_rates_per_prio; i++) { + add_commas((char *)&rate_buf, priv->rate_limits[i].rate); + sbuf_printf(&sbuf," %3d %d %s %s\n", i, + priv->rate_limits[i].ref, + (priv->rate_limits[i].burst_size ? "HIGH": "LOW"), rate_buf); + } + error = sbuf_finish(&sbuf); + sbuf_delete(&sbuf); + mutex_unlock(&priv->rate_limit_table_lock); + return (error); +} + +/* Shows all rate limit rings, their qp number and rate. + * Choose one for csv display - to dump to a file, + * Choose two for table display. */ +static int show_rate_limit_rings_list(SYSCTL_HANDLER_ARGS) +{ + static char buf[RL_RINGS_BUFFER_LENGTH] = {0}; + struct mlx4_en_priv *priv; + size_t len; + int i, ret_head, ret = 0; + priv = arg1; + + if (req->newptr != NULL) { + len = req->newlen - req->newidx; + SYSCTL_IN(req, buf, len); + if (strcmp(buf, "1") == 0) { + ret_head = sprintf(buf, + "\nCurrent number of rate limit rings: %d", + priv->rate_limit_tx_ring_num); + for (i = priv->native_tx_ring_num; i < priv->tx_ring_num; i++) { + if (priv->tx_ring[i] && + priv->tx_ring[i]->rl_data.user_valid == true) { + ret = ret + sprintf(buf+ret_head+ret, + "\n%d,%d,%u", i, priv->tx_ring[i]->qpn, + priv->rate_limits[priv->tx_ring[i]->rl_data.rate_index].rate); + } + } + buf[strlen(buf)] = '\0'; + } else if (strcmp(buf, "2") == 0) { + ret_head = sprintf(buf, "\nCurrent number of rate limit rings: %d", + priv->rate_limit_tx_ring_num); + ret_head = ret_head + sprintf(buf+ret_head, "\nRing ID QPN Rate"); + for (i = priv->native_tx_ring_num; i < priv->tx_ring_num; i++) { + if (priv->tx_ring[i] && + priv->tx_ring[i]->rl_data.user_valid == true) { + ret = ret + sprintf(buf+ret_head+ret, + "\n%d %d %u", + i, priv->tx_ring[i]->qpn, 
+ priv->rate_limits[priv->tx_ring[i]->rl_data.rate_index].rate); + } + } + buf[strlen(buf)] = '\0'; + } else { + memset(&buf, 0, sizeof(buf)); + } + } + SYSCTL_OUT(req, buf, strlen(buf)+1); + if (req->oldptr != NULL) { + memset(&buf, 0, sizeof(buf)); + } + return (0); +} +#endif static int mlx4_en_set_tx_ppp(SYSCTL_HANDLER_ARGS) { @@ -2376,7 +2850,11 @@ struct sysctl_oid *coal; struct sysctl_oid_list *coal_list; const char *pnameunit; - +#ifdef CONFIG_RATELIMIT + u8 i; + char rate_index_name[30]; + char rate_index_desc[40]; +#endif dev = priv->dev; ctx = &priv->conf_ctx; pnameunit = device_get_nameunit(priv->mdev->pdev->dev.bsddev); @@ -2394,9 +2872,18 @@ SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_rings", CTLFLAG_RD, &priv->rx_ring_num, 0, "Number of receive rings"); - SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_rings", - CTLFLAG_RD, &priv->tx_ring_num, 0, - "Number of transmit rings"); +#ifdef CONFIG_RATELIMIT + SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "native_tx_rings", + CTLFLAG_RD, &priv->native_tx_ring_num, 0, + "Number of native transmit rings"); + SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rate_limit_tx_rings", + CTLFLAG_RD, &priv->rate_limit_tx_ring_num, 0, + "Number of rate limit transmit rings"); +#else + SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_rings", + CTLFLAG_RD, &priv->tx_ring_num, 0, + "Number of transmit rings"); +#endif SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "rx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, mlx4_en_set_rx_ring_size, "I", "Receive ring size"); @@ -2415,7 +2902,34 @@ SYSCTL_ADD_STRING(ctx, node_list, OID_AUTO, "device_name", CTLFLAG_RD, __DECONST(void *, pnameunit), 0, "PCI device name"); - +#ifdef CONFIG_RATELIMIT + if (priv->mdev->dev->caps.rl_caps.enable) { + SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "num_rates", + CTLFLAG_RD, &priv->num_rates_per_prio, 0, + "number of rates supported"); + for (i = 1; i <= priv->num_rates_per_prio; i++) { + sprintf(rate_index_name, "rate_limit_%d", i); + 
sprintf(rate_index_desc, "set the rate for index %d", i); + SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, rate_index_name, + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, i, + mlx4_en_set_rate_for_index, "I", rate_index_desc); + sprintf(rate_index_name, "burst_size_%d", i); + sprintf(rate_index_desc, "set the burst_size for index %d", i); + SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, rate_index_name, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, i, + mlx4_en_set_burst_for_index, "A", rate_index_desc); + } + SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "add_rate", + CTLTYPE_UINT | CTLFLAG_WR | CTLFLAG_MPSAFE | CTLFLAG_SKIP, priv, 0, + mlx4_en_set_rate_on_first_available_index, "I", "add rate to an available index"); + SYSCTL_ADD_OID(ctx, node_list, OID_AUTO, "rate_limit_show", + CTLTYPE_STRING | CTLFLAG_RD, priv, 0, mlx4_en_show_rate_table, + "A", "presentation of rate table"); + SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "dump_rate_limit_rings", + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, + show_rate_limit_rings_list, "A", "list of all rate limit rings and rates"); + } +#endif /* Add coalescer configuration. 
*/ coal = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, "coalesce", CTLFLAG_RD, NULL, "Interrupt coalesce configuration"); @@ -2459,6 +2973,9 @@ sysctl_ctx_init(ctx); node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl), OID_AUTO, "stat", CTLFLAG_RD, NULL, "Statistics"); +#ifdef CONFIG_RATELIMIT + priv->sysctl_stat = node; +#endif node_list = SYSCTL_CHILDREN(node); #ifdef MLX4_EN_PERF_STAT @@ -2594,9 +3111,12 @@ &priv->pkstats.tx_gt_1548_bytes_packets, "TX Greater Then 1548 Bytes Packets"); - - +#ifdef CONFIG_RATELIMIT + for (i = 0; i < priv->native_tx_ring_num; i++) { + /* Rate limit rings stats are handled elsewhere */ +#else for (i = 0; i < priv->tx_ring_num; i++) { +#endif tx_ring = priv->tx_ring[i]; snprintf(namebuf, sizeof(namebuf), "tx_ring%d", i); ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf, Index: sys/ofed/drivers/net/mlx4/en_port.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_port.c +++ sys/ofed/drivers/net/mlx4/en_port.c @@ -191,6 +191,10 @@ priv->port_stats.queue_stopped = 0; priv->port_stats.wake_queue = 0; for (i = 0; i < priv->tx_ring_num; i++) { +#ifdef CONFIG_RATELIMIT + if (!TX_RING_USER_VALID(i)) + continue; +#endif priv->port_stats.tx_chksum_offload += priv->tx_ring[i]->tx_csum; priv->port_stats.queue_stopped += priv->tx_ring[i]->queue_stopped; priv->port_stats.wake_queue += priv->tx_ring[i]->wake_queue; Index: sys/ofed/drivers/net/mlx4/en_tx.c =================================================================== --- sys/ofed/drivers/net/mlx4/en_tx.c +++ sys/ofed/drivers/net/mlx4/en_tx.c @@ -39,6 +39,11 @@ #include #include +#ifdef CONFIG_RATELIMIT +#include +#include +#endif + #include #include #include @@ -82,14 +87,32 @@ ring->size = size; ring->size_mask = size - 1; ring->stride = stride; +#ifdef CONFIG_RATELIMIT + ring->rl_data.rate_index = 0; + /* User_valid should be false in a rate_limit ring until the + * creation process of the ring is done, after the activation. 
*/ + if (queue_idx < priv->native_tx_ring_num) + ring->rl_data.user_valid = true; + else + ring->rl_data.user_valid = false; +#endif ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS; ring->inline_thold = min(inline_thold, MAX_INLINE); mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF); mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF); /* Allocate the buf ring */ +#ifdef CONFIG_RATELIMIT + if (queue_idx < priv->native_tx_ring_num) + ring->br = buf_ring_alloc(MLX4_EN_DEF_TX_QUEUE_SIZE, M_DEVBUF, + M_WAITOK, &ring->tx_lock.m); + else + ring->br = buf_ring_alloc(size / 4, M_DEVBUF, M_WAITOK, + &ring->tx_lock.m); +#else ring->br = buf_ring_alloc(MLX4_EN_DEF_TX_QUEUE_SIZE, M_DEVBUF, - M_WAITOK, &ring->tx_lock.m); + M_WAITOK, &ring->tx_lock.m); +#endif if (ring->br == NULL) { en_err(priv, "Failed allocating tx_info ring\n"); return -ENOMEM; @@ -225,8 +248,23 @@ ring->qp_state = MLX4_QP_STATE_RST; ring->doorbell_qpn = ring->qp.qpn << 8; +#ifdef CONFIG_RATELIMIT + if (ring->rl_data.rate_index) { + /* Force rate limit user priority */ + user_prio = MLX4_EN_DEF_RL_USER_PRIO; + } +#endif + mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn, ring->cqn, user_prio, &ring->context); + +#ifdef CONFIG_RATELIMIT + if (ring->rl_data.rate_index) { + ring->context.rate_limit_index = ring->rl_data.rate_index; + ring->context.sq_size_stride |= MLX4_DISABLE_WQE_PREFETCH; + } +#endif + if (ring->bf_enabled) ring->context.usr_page = cpu_to_be32(ring->bf.uar->index); @@ -235,6 +273,448 @@ return err; } +#ifdef CONFIG_RATELIMIT +static int mlx4_en_find_available_tx_ring_index(struct mlx4_en_priv *priv) +{ + int index = -1; + struct mlx4_en_reuse_index_list_element *reused_item; + + spin_lock(&priv->tx_ring_index_lock); + /* Check for availble index in re-use list */ + if ((reused_item = STAILQ_FIRST(&priv->reuse_index_list_head))) { + index = reused_item->val; + /* Remove head index from re-use list */ + STAILQ_REMOVE_HEAD(&priv->reuse_index_list_head, 
entry); + } + else if (priv->tx_ring_num < MAX_TX_RINGS) { + index = priv->tx_ring_num; + priv->tx_ring_num++; + } else /* Reached max resources capacity */ + index = -1; + spin_unlock(&priv->tx_ring_index_lock); + + return index; +} + +/* Check whether the requested rate is valid. + * If so, retrieve the relevant rate index. */ +static int mlx4_en_validate_rate_ctl_req(struct mlx4_en_priv *priv, + struct ifreq_txrtlmt *rl_req, u8 *rate_index) +{ + int i; + u32 rate; + + /* Kernel passes rate in bytes and the driver converts it to bits in order + * to communicate with the hardware. */ + rl_req->txringid_max_rate = rl_req->txringid_max_rate * BITS_PER_BYTE; + rate = rl_req->txringid_max_rate; + + if (rate > priv->mdev->dev->caps.rl_caps.calc_max_val || + (rate < priv->mdev->dev->caps.rl_caps.calc_min_val && + rate != 0)) { + en_err(priv, "Not valid rate limit : %u Bps %d\n",rate / BITS_PER_BYTE, priv->port); + return (EINVAL); + } + + /* Searching for the requested rate in the rate table */ + for (i = 0; i <= priv->num_rates_per_prio; i++) { + if (priv->rate_limits[i].rate == rate) { + *rate_index = i; + return (0); + } + } + + en_err(priv, "Not existing rate limit %u Bps %d\n",rate / BITS_PER_BYTE, priv->port); + return (EINVAL); +} + +void mlx4_en_invalidate_rl_ring(struct mlx4_en_priv *priv, uint32_t ring_id) +{ + priv->tx_ring[ring_id]->rl_data.user_valid = false; + sysctl_ctx_free(&priv->tx_ring[ring_id]->rl_data.rl_stats_ctx); +} + +void mlx4_en_rl_reused_index_insert(struct mlx4_en_priv *priv, uint32_t ring_id) +{ + struct mlx4_en_reuse_index_list_element *reused_item; + + reused_item = priv->reuse_index_list_array + ring_id; + spin_lock(&priv->tx_ring_index_lock); + STAILQ_INSERT_TAIL(&priv->reuse_index_list_head, reused_item, entry); + spin_unlock(&priv->tx_ring_index_lock); +} + +static void mlx4_en_rate_limit_sysctl_stat(struct mlx4_en_priv *priv, int ring_id) +{ + struct mlx4_en_tx_ring *tx_ring; + struct sysctl_ctx_list *ctx; + struct sysctl_oid_list 
*head_node; + struct sysctl_oid *ring_node; + struct sysctl_oid_list *ring_list; + char namebuf[128]; + + tx_ring = priv->tx_ring[ring_id]; + ctx = &tx_ring->rl_data.rl_stats_ctx; + snprintf(namebuf, sizeof(namebuf), "tx_ring%d", ring_id); + head_node = SYSCTL_CHILDREN(priv->sysctl_stat); + ring_node = SYSCTL_ADD_NODE(ctx, head_node, OID_AUTO, namebuf, + CTLFLAG_RD, NULL, "TX Ring"); + ring_list = SYSCTL_CHILDREN(ring_node); + SYSCTL_ADD_UINT(ctx, ring_list, OID_AUTO, "rate_limit_val", + CTLFLAG_RD, &priv->rate_limits[tx_ring->rl_data.rate_index].rate, 0, "Rate Limit value"); + SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "packets", + CTLFLAG_RD, &tx_ring->packets, "TX packets"); + SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "bytes", + CTLFLAG_RD, &tx_ring->bytes, "TX bytes"); +} + +static int mlx4_en_defer_rl_op(struct mlx4_en_priv *priv, + int ring_id, u8 rate_index, + enum mlx4_en_rl_operation opp) +{ + struct mlx4_en_rl_task_list_element *rl_item; + + rl_item = kmalloc(sizeof(struct mlx4_en_rl_task_list_element), M_NOWAIT); + if (!rl_item) { + en_err(priv, "Failed allocating rl_item\n"); + return (ENOMEM); + } + + /* Saving recieved data from kernel in order to use it later in + * the defer function */ + rl_item->ring_id = ring_id; + rl_item->rate_index = rate_index; + rl_item->operation = opp; + + spin_lock(&priv->rl_op_lock); + STAILQ_INSERT_TAIL(&priv->rl_op_list_head, rl_item, entry); + spin_unlock(&priv->rl_op_lock); + taskqueue_enqueue(priv->rl_tq, &priv->rl_task); + return (0); +} + +int mlx4_en_create_rate_limit_ring(struct mlx4_en_priv *priv, + struct ifreq_txrtlmt *rl_req) +{ + int err = 0; + int index = 0; + u8 rate_index; + + /* Check for HW/FW support */ + if (!priv->mdev->dev->caps.rl_caps.enable) { + en_err(priv, "No HW/FW support for rate limit rings\n"); + return (ENODEV); + } + + /* Validate rate limit request */ + if(mlx4_en_validate_rate_ctl_req(priv, rl_req, &rate_index)) + return (EINVAL); + + /* Find available ring index */ + index = 
mlx4_en_find_available_tx_ring_index(priv); + + if (index < 0) { + en_err(priv, "Failed to create Rate limit resources, " + "Max capacity reached\n"); + return (EINVAL); + } + + atomic_add_int(&priv->rate_limits[rate_index].ref, 1); + rl_req->txringid = index; + + /* Defer ring creation */ + err = mlx4_en_defer_rl_op(priv, rl_req->txringid, rate_index, MLX4_EN_RL_ADD); + + return err; +} + +int mlx4_en_modify_rate_limit_ring(struct mlx4_en_priv *priv, + struct ifreq_txrtlmt *rl_req) +{ + u8 rate_index; + int err = 0; + + /* Validate rate limit request */ + if(mlx4_en_validate_rate_ctl_req(priv, rl_req, &rate_index)) + return (err); + + atomic_add_int(&priv->rate_limits[rate_index].ref, 1); + + /* Validation for ring index occurs at the deffered function + * in order to prevent failure when creation was not completed + * yet (defered actions are executed by one thread) */ + + /* Defer ring modification */ + err = mlx4_en_defer_rl_op(priv, rl_req->txringid, rate_index, MLX4_EN_RL_MOD); + + return (err); +} + +int mlx4_en_destroy_rate_limit_ring(struct mlx4_en_priv *priv, + struct ifreq_txrtlmt *rl_req) +{ + uint32_t ring_id; + int err = 0; + + ring_id = rl_req->txringid; + + /* Check that this is indeed a rate limit ring */ + if (ring_id < priv->native_tx_ring_num || ring_id >= priv->tx_ring_num) { + en_err(priv, "Deleting ring %d: Permision denied: Not a rate limit ring\n", + ring_id); + return (EINVAL); + } + + /* Defer ring destruction */ + /* There is no handling with new rate index when destroying a ring + * therefor, sending zero as a rate index. 
*/ + err = mlx4_en_defer_rl_op(priv, rl_req->txringid, 0, MLX4_EN_RL_DEL); + + return err; +} + +static void mlx4_en_create_rl_res(struct mlx4_en_priv *priv, + int ring_id, u8 rate_index) +{ + struct mlx4_en_cq *cq; + struct mlx4_en_tx_ring *tx_ring; + struct mlx4_en_dev *mdev = priv->mdev; + int err = 0; + int node = 0; + int j; + + + if (priv->tx_ring[ring_id]) { + /* Ring already exists, needs activation */ + /* Make sure drbr queue has no left overs from before */ + tx_ring = priv->tx_ring[ring_id]; + goto activate; + } + + err = mlx4_en_create_cq(priv, &priv->tx_cq[ring_id], + MLX4_EN_DEF_RL_TX_RING_SIZE, ring_id, TX, node); + if (err) { + en_err(priv, "Failed to create rate limit tx CQ, ring index %u, rate %u\n", + ring_id, rate_index); + goto err_create_cq; + } + + err = mlx4_en_create_tx_ring(priv, &priv->tx_ring[ring_id], + MLX4_EN_DEF_RL_TX_RING_SIZE, TXBB_SIZE, node, ring_id); + if (err) { + en_err(priv, "Failed to create rate limited tx ring %u, rate %u\n", + ring_id, rate_index); + goto err_create_ring; + } + + tx_ring = priv->tx_ring[ring_id]; + +activate: + + sysctl_ctx_init(&tx_ring->rl_data.rl_stats_ctx); + tx_ring->rl_data.rate_index = rate_index; + + /* Default moderation */ + cq = priv->tx_cq[ring_id]; + cq->moder_cnt = priv->tx_frames; + cq->moder_time = priv->tx_usecs; + + mutex_lock(&mdev->state_lock); + if (!priv->port_up) { + /* No need activating resources, start_port will take care of that */ + tx_ring->rl_data.user_valid = true; + mutex_unlock(&mdev->state_lock); + return; + } + + /* Activate resources */ + err = mlx4_en_activate_cq(priv, cq, ring_id); + if (err) { + en_err(priv, "Failed activating Rate Limit Tx CQ\n"); + goto err_activate_resources; + } + + err = mlx4_en_set_cq_moder(priv, cq); + if (err) { + en_err(priv, "Failed setting cq moderation parameters"); + mlx4_en_deactivate_cq(priv, cq); + goto err_activate_resources; + } + en_dbg(DRV, priv, "Resetting index of CQ:%d to -1\n", ring_id); + cq->buf->wqe_index = 
cpu_to_be16(0xffff); + + err = mlx4_en_activate_tx_ring(priv, tx_ring, cq->mcq.cqn, + MLX4_EN_DEF_RL_USER_PRIO); + if (err) { + en_err(priv, "Failed activating rate limit TX ring\n"); + mlx4_en_deactivate_cq(priv, cq); + goto err_activate_resources; + } + + /* Arm CQ for TX completions */ + mlx4_en_arm_cq(priv, cq); + + /* Set initial ownership of all Tx TXBBs to SW (1) */ + for (j = 0; j < tx_ring->buf_size; j += STAMP_STRIDE) + *((u32 *) (tx_ring->buf + j)) = 0xffffffff; + + /* Set ring as valid */ + tx_ring->rl_data.user_valid = true; + mutex_unlock(&mdev->state_lock); + + priv->rate_limit_tx_ring_num++; + + /* Add rate limit statistics to sysctl if debug option was enabled */ + if (show_rl_sysctl_info) + mlx4_en_rate_limit_sysctl_stat(priv, ring_id); + return; + +err_activate_resources: + mlx4_en_invalidate_rl_ring(priv, ring_id); + mlx4_en_rl_reused_index_insert(priv, ring_id); + atomic_subtract_int(&priv->rate_limits[rate_index].ref, 1); + mutex_unlock(&mdev->state_lock); + return; + +err_create_ring: + if (priv->tx_cq[ring_id]) + mlx4_en_destroy_cq(priv, &priv->tx_cq[ring_id]); + +err_create_cq: + mlx4_en_rl_reused_index_insert(priv, ring_id); + atomic_subtract_int(&priv->rate_limits[rate_index].ref, 1); +} + +static void mlx4_en_modify_rl_res(struct mlx4_en_priv *priv, + int ring_id, u8 rate_index) +{ + struct mlx4_en_tx_ring *tx_ring; + struct mlx4_update_qp_params update_params; + int err; + + tx_ring = priv->tx_ring[ring_id]; + + /* Ring validation */ + if(!TX_RING_USER_VALID(ring_id)) { + en_err(priv, "Failed modifying new rate, ring %d doesn't exist\n", ring_id); + /* If the modified ring does not exist, no need to add one + * to the reference count of the requested rate */ + atomic_subtract_int(&priv->rate_limits[rate_index].ref, 1); + return; + } + + if (priv->rate_limits[tx_ring->rl_data.rate_index].rate != + priv->rate_limits[rate_index].rate) { + update_params.rl_index = rate_index; + err = mlx4_update_qp(priv->mdev->dev, tx_ring->qpn, + 
MLX4_UPDATE_QP_RATE_LIMIT, &update_params); + if (err) { + en_err(priv, "Failed updating ring %d with new rate %uBytes/sec, err: %d\n", + ring_id, (priv->rate_limits[rate_index].rate/8), err); + atomic_subtract_int(&priv->rate_limits[rate_index].ref, 1); + return; + } + } + atomic_subtract_int(&priv->rate_limits[tx_ring->rl_data.rate_index].ref, 1); + tx_ring->rl_data.rate_index = rate_index; +} + +static void mlx4_en_destroy_rl_res(struct mlx4_en_priv *priv, + int ring_id) +{ + struct mlx4_en_tx_ring *ring; + struct mlx4_en_dev *mdev = priv->mdev; + + ring = priv->tx_ring[ring_id]; + + mutex_lock(&mdev->state_lock); + + /* Index was validated, thus ring is not NULL */ + spin_lock(&ring->tx_lock); + if (ring->rl_data.user_valid == false) { + en_err(priv, "ring %d doesn't exist\n", ring_id); + spin_unlock(&ring->tx_lock); + mutex_unlock(&mdev->state_lock); /* release the lock taken on entry */ + return; + } + ring->rl_data.user_valid = false; + if (!drbr_empty(priv->dev, ring->br)) { + struct mbuf *m; + while ((m = buf_ring_dequeue_sc(ring->br)) != NULL) { + m_freem(m); + } + } + spin_unlock(&ring->tx_lock); + atomic_subtract_int(&priv->rate_limits[ring->rl_data.rate_index].ref, 1); + + /* Deactivate resources */ + if (priv->port_up) { + mlx4_en_deactivate_tx_ring(priv, ring); + mlx4_en_deactivate_cq(priv, priv->tx_cq[ring_id]); + msleep(10); + mlx4_en_free_tx_buf(priv->dev, ring); + } + mutex_unlock(&mdev->state_lock); + + /* clear statistics */ + ring->bytes = 0; + ring->packets = 0; + + sysctl_ctx_free(&ring->rl_data.rl_stats_ctx); + + /* Add index to re-use list */ + priv->rate_limit_tx_ring_num--; + mlx4_en_rl_reused_index_insert(priv, ring_id); +} + +/* Called from the rl_task context, it acquires the first + * task from the rl_op_list and calls the relevant functions according to + * the needed operation.
*/ +void mlx4_en_async_rl_operation(void *context, int pending) +{ + struct mlx4_en_priv *priv; + struct mlx4_en_rl_task_list_element *rl_item; + enum mlx4_en_rl_operation rl_operation; + int ring_id; + u8 rate_index; + + priv = context; + + while(pending){ + /* Check for availble operation in the operation list */ + spin_lock(&priv->rl_op_lock); + if ((rl_item = STAILQ_FIRST(&priv->rl_op_list_head))) { + ring_id = rl_item->ring_id; + rl_operation = rl_item->operation; + rate_index = rl_item->rate_index; + STAILQ_REMOVE_HEAD(&priv->rl_op_list_head, entry); + spin_unlock(&priv->rl_op_lock); + kfree(rl_item); + } + else { + spin_unlock(&priv->rl_op_lock); + pr_err("No avaliable rate limit item \n"); + return; + } + + switch (rl_operation){ + case MLX4_EN_RL_ADD: + mlx4_en_create_rl_res(priv, ring_id, rate_index); + break; + case MLX4_EN_RL_DEL: + mlx4_en_destroy_rl_res(priv, ring_id); + break; + case MLX4_EN_RL_MOD: + mlx4_en_modify_rl_res(priv, ring_id, rate_index); + break; + default: + pr_err("Not supported operation - %d \n", rl_operation); + } + pending--; + } +} +#endif + void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring) { @@ -451,10 +931,18 @@ if (unlikely(ring->blocked) && (ring->prod - ring->cons) <= ring->full_size) { ring->blocked = 0; +#ifdef CONFIG_RATELIMIT + if (cq->ring < priv->native_tx_ring_num) { + if (atomic_fetchadd_int(&priv->blocked, -1) == 1) + atomic_clear_int(&dev->if_drv_flags ,IFF_DRV_OACTIVE); + priv->port_stats.wake_queue++; + } +#else if (atomic_fetchadd_int(&priv->blocked, -1) == 1) atomic_clear_int(&dev->if_drv_flags ,IFF_DRV_OACTIVE); - ring->wake_queue++; priv->port_stats.wake_queue++; +#endif + ring->wake_queue++; } return done; } @@ -817,19 +1305,30 @@ * stopping the queue */ if (unlikely((int)(ring->prod - ring->cons) > ring->full_size)) { +#ifdef CONFIG_RATELIMIT + if (tx_ind < priv->native_tx_ring_num) { + /* every full native Tx ring stops queue */ + if (ring->blocked == 0) + 
atomic_add_int(&priv->blocked, 1); + /* Set HW-queue-is-full flag */ + atomic_set_int(&dev->if_drv_flags, IFF_DRV_OACTIVE); + priv->port_stats.queue_stopped++; + } +#else /* every full Tx ring stops queue */ if (ring->blocked == 0) - atomic_add_int(&priv->blocked, 1); + atomic_add_int(&priv->blocked, 1); /* Set HW-queue-is-full flag */ atomic_set_int(&dev->if_drv_flags, IFF_DRV_OACTIVE); - ring->blocked = 1; priv->port_stats.queue_stopped++; +#endif + ring->blocked = 1; ring->queue_stopped++; /* Use interrupts to find out when queue opened */ cq = priv->tx_cq[tx_ind]; mlx4_en_arm_cq(priv, cq); - return EBUSY; + return (ENOBUFS); } /* Track current inflight packets for performance analysis */ @@ -1010,8 +1509,14 @@ int enqueued, err = 0; ring = priv->tx_ring[tx_ind]; + +#ifdef CONFIG_RATELIMIT + if ((dev->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != + IFF_DRV_RUNNING || priv->port_up == 0 || ring->blocked == 1) { +#else if ((dev->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING || priv->port_up == 0) { +#endif if (m != NULL) err = drbr_enqueue(dev, ring->br, m); return (err); @@ -1057,11 +1562,15 @@ priv = dev->if_softc; tx_ind = cq->ring; ring = priv->tx_ring[tx_ind]; + if (dev->if_drv_flags & IFF_DRV_RUNNING) { mlx4_en_xmit_poll(priv, tx_ind); spin_lock(&ring->tx_lock); if (!drbr_empty(dev, ring->br)) - mlx4_en_transmit_locked(dev, tx_ind, NULL); +#ifdef CONFIG_RATELIMIT + if (ring->rl_data.user_valid) +#endif + mlx4_en_transmit_locked(dev, tx_ind, NULL); spin_unlock(&ring->tx_lock); } } @@ -1074,17 +1583,51 @@ struct mlx4_en_cq *cq; int i, err = 0; +#ifdef CONFIG_RATELIMIT + /*Check mbuf if this is a rate limit packet*/ + if (M_HASHTYPE_TEST(m, M_HASHTYPE_TXRTLMT)) { + /*make sure the ring is allocated*/ + if ( priv->tx_ring[m->m_pkthdr.flowid] ) + i = m->m_pkthdr.flowid; + else + i = mlx4_en_select_queue(dev, m); + } + else +#endif /* Compute which queue to use */ i = mlx4_en_select_queue(dev, m); +#ifdef CONFIG_RATELIMIT 
+lock_and_transmit: +#endif ring = priv->tx_ring[i]; - if (spin_trylock(&ring->tx_lock)) { +#ifdef CONFIG_RATELIMIT + if (ring->rl_data.user_valid == false) { + /* Rate limit ring is not active */ + spin_unlock(&ring->tx_lock); + i = mlx4_en_select_queue(dev, m); + goto lock_and_transmit; + + } +#endif err = mlx4_en_transmit_locked(dev, i, m); spin_unlock(&ring->tx_lock); /* Poll CQ here */ mlx4_en_xmit_poll(priv, i); } else { +#ifdef CONFIG_RATELIMIT + /* + * This is the only place where we check user_valid without tx_lock + * It is ok because the design is that destroy and transmit will not + * happen in parallel on the same ring (tcp_output code). + */ + if (ring->rl_data.user_valid == false) { + /* Rate limit ring is not active */ + i = mlx4_en_select_queue(dev, m); + goto lock_and_transmit; + } +#endif err = drbr_enqueue(dev, ring->br, m); cq = priv->tx_cq[i]; taskqueue_enqueue(cq->tq, &cq->cq_task); @@ -1105,6 +1648,10 @@ for (int i = 0; i < priv->tx_ring_num; i++) { ring = priv->tx_ring[i]; +#ifdef CONFIG_RATELIMIT + if (!ring) + continue; +#endif spin_lock(&ring->tx_lock); while ((m = buf_ring_dequeue_sc(ring->br)) != NULL) m_freem(m); Index: sys/ofed/drivers/net/mlx4/fw.h =================================================================== --- sys/ofed/drivers/net/mlx4/fw.h +++ sys/ofed/drivers/net/mlx4/fw.h @@ -123,6 +123,9 @@ u32 sync_qp; u8 timestamp_support; u32 max_extended_counters; +#ifdef CONFIG_RATELIMIT + struct mlx4_rate_limit_caps rl_caps; +#endif }; struct mlx4_func_cap { Index: sys/ofed/drivers/net/mlx4/fw.c =================================================================== --- sys/ofed/drivers/net/mlx4/fw.c +++ sys/ofed/drivers/net/mlx4/fw.c @@ -560,6 +560,11 @@ #define QUERY_DEV_CAP_ETS_CFG_OFFSET 0x9c #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0 +#ifdef CONFIG_RATELIMIT +#define QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET 0xd8 +#define QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET 0xda +#endif + dev_cap->flags2 = 0; mailbox = 
mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -724,7 +729,19 @@ dev_cap->max_rq_sg = field; MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET); dev_cap->max_rq_desc_sz = size; +#ifdef CONFIG_RATELIMIT + MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET); + dev_cap->rl_caps.max_unit = size >> 14; + dev_cap->rl_caps.max_val = size & 0xfff; + + if (dev_cap->rl_caps.max_val > 0) { + dev_cap->rl_caps.enable = 1; + MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET); + dev_cap->rl_caps.min_unit = size >> 14; + dev_cap->rl_caps.min_val = size & 0xfff; + } +#endif MLX4_GET(dev_cap->bmme_flags, outbox, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); MLX4_GET(dev_cap->reserved_lkey, outbox, Index: sys/ofed/drivers/net/mlx4/main.c =================================================================== --- sys/ofed/drivers/net/mlx4/main.c +++ sys/ofed/drivers/net/mlx4/main.c @@ -44,10 +44,17 @@ #include #include #include +#ifdef CONFIG_RATELIMIT +#include +#endif #include #include +#ifdef CONFIG_RATELIMIT +#include +#endif + #include "mlx4.h" #include "fw.h" #include "icm.h" @@ -590,10 +597,37 @@ dev->caps.port_mask[i] = dev->caps.port_type[i]; } +#ifdef CONFIG_RATELIMIT +static u64 mlx4_calc_rl_supported_rate(u16 rate_val, u8 unit) +{ + u64 calc_rate = 0; /* 64-bit: a Gbps product does not fit in 32 bits */ + u64 rate = (u64)rate_val; + + switch (unit) { + case MLX4_QP_RATE_LIMIT_KBPS: + calc_rate = rate * (u64)(1000); + break; + case MLX4_QP_RATE_LIMIT_MBPS: + calc_rate = rate * (u64)(1000*1000); + break; + case MLX4_QP_RATE_LIMIT_GBPS: + calc_rate = rate * (u64)(1000*1000*1000); + break; + default: + calc_rate = 0; + } + + return calc_rate; +} +#endif + static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { int err; int i; +#ifdef CONFIG_RATELIMIT + u16 available_RLPP = 0; +#endif err = mlx4_QUERY_DEV_CAP(dev, dev_cap); if (err) { @@ -645,6 +679,26 @@ dev->caps.trans_code[i] = dev_cap->trans_code[i]; } +#ifdef CONFIG_RATELIMIT + /* UNIT must be determined in order to have a valid
max/min rate */ + if (dev_cap->rl_caps.enable) { + dev->caps.rl_caps.max_unit = dev_cap->rl_caps.max_unit; + dev->caps.rl_caps.max_val = dev_cap->rl_caps.max_val; + dev->caps.rl_caps.min_unit = dev_cap->rl_caps.min_unit; + dev->caps.rl_caps.min_val = dev_cap->rl_caps.min_val; + dev->caps.rl_caps.calc_max_val = mlx4_calc_rl_supported_rate(dev->caps.rl_caps.max_val, + dev->caps.rl_caps.max_unit); + dev->caps.rl_caps.calc_min_val = mlx4_calc_rl_supported_rate(dev->caps.rl_caps.min_val, + dev->caps.rl_caps.min_unit); + if (!dev->caps.rl_caps.calc_max_val || !dev->caps.rl_caps.calc_min_val) + dev->caps.rl_caps.enable = 0; + else + dev->caps.rl_caps.enable = dev_cap->rl_caps.enable; + } else { + dev->caps.rl_caps.enable = 0; + } +#endif + dev->caps.uar_page_size = PAGE_SIZE; dev->caps.num_uars = dev_cap->uar_size / PAGE_SIZE; dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay; @@ -787,6 +841,25 @@ else dev->caps.max_counters = dev->caps.max_basic_counters; +#ifdef CONFIG_RATELIMIT + if (dev->caps.rl_caps.enable) { + /* Use fw_reserved_qp_base to preserve the original reserved_qps value */ + dev->caps.fw_reserved_qp_base = dev_cap->reserved_qps; + for (i = 1; i <= dev_cap->num_ports; i++){ + err = mlx4_query_rl_fw_resources(dev, i, &available_RLPP); + if (err) { + mlx4_err(dev, "Couldn't read available number of rates for port %d\n", i); + dev->caps.rl_caps.enable = 0; + /* Reset reserved_qps to original value */ + dev_cap->reserved_qps = dev->caps.fw_reserved_qp_base; + break; + } + else + dev_cap->reserved_qps += available_RLPP; + } + } +#endif + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] = @@ -1791,6 +1864,10 @@ static void mlx4_close_hca(struct mlx4_dev *dev) { +#ifdef CONFIG_RATELIMIT + if (dev->caps.rl_caps.enable) + sysctl_ctx_free(&dev->rl_ctx); +#endif unmap_internal_clock(dev); unmap_bf_area(dev); if (mlx4_is_slave(dev)) { 
@@ -1948,6 +2025,68 @@ dev->oper_log_mgm_entry_size, mlx4_log_num_mgm_entry_size); } +#ifdef CONFIG_RATELIMIT +static char* mlx4_sysctl_rate_limit_unit_str(u8 val) +{ + char *unit_str = NULL; + + switch (val) { + case MLX4_QP_RATE_LIMIT_KBPS: + unit_str = "Kbps"; + break; + case MLX4_QP_RATE_LIMIT_MBPS: + unit_str = "Mbps"; + break; + case MLX4_QP_RATE_LIMIT_GBPS: + unit_str = "Gbps"; + break; + default: + break; + } + return unit_str; +} + +static void mlx4_sysctl_rate_limit_caps(struct mlx4_dev *dev) +{ + struct sysctl_ctx_list *ctx; + struct sysctl_oid *node; + struct sysctl_oid_list *node_list; + char *min_unit_str; + char *max_unit_str; + char *min_namebuf; + char *max_namebuf; + int min_namebuf_size; + int max_namebuf_size; + + min_namebuf = dev->caps.rl_caps.min_val_str; + min_namebuf_size = sizeof(dev->caps.rl_caps.min_val_str); + max_namebuf = dev->caps.rl_caps.max_val_str; + max_namebuf_size = sizeof(dev->caps.rl_caps.max_val_str); + + ctx = &dev->rl_ctx; + sysctl_ctx_init(ctx); + node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(dev->pdev->dev.kobj.oidp), + OID_AUTO, "rate_limit_caps" , CTLFLAG_RD, 0, "rate limit capabilitiess"); + node_list = SYSCTL_CHILDREN(node); + + if ((min_unit_str = mlx4_sysctl_rate_limit_unit_str(dev->caps.rl_caps.min_unit))) { + snprintf(min_namebuf, min_namebuf_size, + "%d %s", dev->caps.rl_caps.min_val, min_unit_str); + } + if ((max_unit_str = mlx4_sysctl_rate_limit_unit_str(dev->caps.rl_caps.max_unit))) { + snprintf(max_namebuf, max_namebuf_size, + "%d %s", dev->caps.rl_caps.max_val, max_unit_str); + } + + SYSCTL_ADD_STRING(ctx, node_list, OID_AUTO, "min_value", + CTLFLAG_RD, min_namebuf, 0, + "Min rate limit value supported [bits/second]"); + SYSCTL_ADD_STRING(ctx, node_list, OID_AUTO, "max_value", + CTLFLAG_RD, max_namebuf, 0, + "Max rate limit value supported [bits/second]"); +} +#endif + static int mlx4_init_hca(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -1994,6 +2133,11 @@ goto err_stop_fw; } +#ifdef 
CONFIG_RATELIMIT + if (dev->caps.rl_caps.enable) + mlx4_sysctl_rate_limit_caps(dev); +#endif + choose_steering_mode(dev, dev_cap); if (mlx4_is_master(dev)) @@ -2538,6 +2682,119 @@ } EXPORT_SYMBOL_GPL(mlx4_counter_free); +#ifdef CONFIG_RATELIMIT +int mlx4_query_rl_fw_resources(struct mlx4_dev *dev, u8 port, + u16 *available_RLPP) +{ + struct mlx4_hw_num_of_rates *hw_all_num_rates; + int err; + struct mlx4_cmd_mailbox *mailbox_out = NULL; + u64 mailbox_in_dma = 0; + u32 inmod = port; + + mailbox_out = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox_out)) + return -ENOMEM; + hw_all_num_rates = (struct mlx4_hw_num_of_rates *) mailbox_out->buf; + + err = mlx4_cmd_box(dev, mailbox_in_dma, mailbox_out->dma, + inmod, MLX4_QUERY_RLPP_FOR_PORT, + MLX4_CMD_QP_RLPP, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (!err) + *available_RLPP = be16_to_cpu(hw_all_num_rates->available_RPP); + + mlx4_free_cmd_mailbox(dev, mailbox_out); + return err; +} + +int mlx4_alloc_rl_fw_resources(struct mlx4_dev *dev, u8 port, + struct mlx4_num_of_rates *all_num_rates) +{ + /* initialize struct mlx4_hw_num_of_rates because prios 8-15 are not is use for now */ + struct mlx4_hw_num_of_rates *hw_all_num_rates = {0}; + int err; + struct mlx4_cmd_mailbox *mailbox_in = NULL; + u64 mailbox_in_dma = 0; + u32 inmod = port; + + mailbox_in = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox_in)) + return -ENOMEM; + + mailbox_in_dma = mailbox_in->dma; + hw_all_num_rates = (struct mlx4_hw_num_of_rates *) mailbox_in->buf; + + hw_all_num_rates->RPP_prio_0 = all_num_rates->RPP_per_prio[0]; + hw_all_num_rates->RPP_prio_1 = all_num_rates->RPP_per_prio[1]; + hw_all_num_rates->RPP_prio_2 = all_num_rates->RPP_per_prio[2]; + hw_all_num_rates->RPP_prio_3 = all_num_rates->RPP_per_prio[3]; + hw_all_num_rates->RPP_prio_4 = all_num_rates->RPP_per_prio[4]; + hw_all_num_rates->RPP_prio_5 = all_num_rates->RPP_per_prio[5]; + hw_all_num_rates->RPP_prio_6 = all_num_rates->RPP_per_prio[6]; + hw_all_num_rates->RPP_prio_7 = 
all_num_rates->RPP_per_prio[7]; + + /* Update the fw the needed amount of reserved qp's for RLPPs */ + hw_all_num_rates->base_qp_num |= cpu_to_be32(all_num_rates->base_qp_num & 0xffffff); + + err = mlx4_cmd(dev, mailbox_in_dma, inmod, + MLX4_ALLOCATE_RLPP_FOR_PORT, + MLX4_CMD_QP_RLPP, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + mlx4_free_cmd_mailbox(dev, mailbox_in); + return err; +} + +int mlx4_set_rates_and_burst_size(struct mlx4_dev *dev, u8 port, u8 index, + struct mlx4_qp_rl_index *qp_rl_index) +{ + struct mlx4_cmd_mailbox *mailbox_in = NULL; + /* initialize struct mlx4_hw_num_of_rates because prios 8-15 are not is use for now */ + struct mlx4_hw_qp_rl_index *hw_qp_rl_index = {0}; + int err = 0; + u64 mailbox_in_dma = 0; + u32 inmod = port | (index << 8); + + mailbox_in = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox_in)) + return -ENOMEM; + mailbox_in_dma = mailbox_in->dma; + hw_qp_rl_index = (struct mlx4_hw_qp_rl_index *) mailbox_in->buf; + + hw_qp_rl_index->rate_prio_0 = cpu_to_be32(qp_rl_index->rates[0]); + hw_qp_rl_index->burst_size_prio_0 |= (qp_rl_index->burst_size[0] & 1); + + hw_qp_rl_index->rate_prio_1 = cpu_to_be32(qp_rl_index->rates[1]); + hw_qp_rl_index->burst_size_prio_1 |= (qp_rl_index->burst_size[1] & 1); + + hw_qp_rl_index->rate_prio_2 = cpu_to_be32(qp_rl_index->rates[2]); + hw_qp_rl_index->burst_size_prio_2 |= (qp_rl_index->burst_size[2] & 1); + + hw_qp_rl_index->rate_prio_3 = cpu_to_be32(qp_rl_index->rates[3]); + hw_qp_rl_index->burst_size_prio_3 |= (qp_rl_index->burst_size[3] & 1); + + hw_qp_rl_index->rate_prio_4 = cpu_to_be32(qp_rl_index->rates[4]); + hw_qp_rl_index->burst_size_prio_4 |= (qp_rl_index->burst_size[4] & 1); + + hw_qp_rl_index->rate_prio_5 = cpu_to_be32(qp_rl_index->rates[5]); + hw_qp_rl_index->burst_size_prio_5 |= (qp_rl_index->burst_size[5] & 1); + + hw_qp_rl_index->rate_prio_6 = cpu_to_be32(qp_rl_index->rates[6]); + hw_qp_rl_index->burst_size_prio_6 |= (qp_rl_index->burst_size[6] & 1); + + hw_qp_rl_index->rate_prio_7 = 
cpu_to_be32(qp_rl_index->rates[7]); + hw_qp_rl_index->burst_size_prio_7 |= (qp_rl_index->burst_size[7] & 1); + + err = mlx4_cmd(dev, mailbox_in_dma, inmod, + MLX4_ALLOCATE_RL_PROPERTIES, + MLX4_CMD_QP_RL_INDEX, + MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + mlx4_free_cmd_mailbox(dev, mailbox_in); + return err; +} +#endif + int __mlx4_clear_if_stat(struct mlx4_dev *dev, u8 counter_index) { @@ -3754,6 +4011,51 @@ return 0; } +#ifdef CONFIG_RATELIMIT +/* + * Parse a comma separated list of priorities (for example "0,3,5") and set + * the matching bit in *lst_of_prios for each one. An empty string selects + * priority 0. A token that is not a number in [0, max_num_prios) discards + * the list and falls back to priority 0 alone. The string is only read, so + * the same buffer can be parsed more than once. + * Returns the number of priorities set by this call (1 for either fallback). + */ +u8 mlx4_parse_prios_for_rl(char *str, u8 *lst_of_prios, int max_num_prios) +{ + char *p = str; + char *end; + long index; + u8 num_prios = 0; + + if (*p == '\0') { + *lst_of_prios |= 1; + num_prios++; + return num_prios; + } + while (*p != '\0') { + /* Skip separators without modifying the caller's string */ + if (*p == ',' || *p == ' ') { + p++; + continue; + } + index = strtol(p, &end, 10); + if (end == p || index >= max_num_prios || index < 0) { + pr_warn("Priority number is not valid, using priority 0 as default\n"); + *lst_of_prios = 1; + num_prios = 1; + return num_prios; + } + /* Resume after the whole number, not one character later */ + p = end; + if (*lst_of_prios & (1 << index)) + continue; + num_prios++; + *lst_of_prios |= (1 << index); + } + return num_prios; +} +#endif + static int __init mlx4_init(void) { int ret; Index: sys/ofed/drivers/net/mlx4/mlx4.h =================================================================== --- sys/ofed/drivers/net/mlx4/mlx4.h +++ sys/ofed/drivers/net/mlx4/mlx4.h @@ -806,6 +806,18 @@ struct mutex mutex; }; +#ifdef CONFIG_RATELIMIT +enum mlx4_qp_rlpp_opmod { + MLX4_ALLOCATE_RLPP_FOR_PORT, + MLX4_QUERY_RLPP_FOR_PORT, +}; + +enum mlx4_rl_index_opmod { + MLX4_ALLOCATE_RL_PROPERTIES, + MLX4_QUERY_RL_PROPERTIES, +}; +#endif + enum { MLX4_NO_RR = 0, MLX4_USE_RR = 1, Index: sys/ofed/drivers/net/mlx4/mlx4_en.h =================================================================== --- sys/ofed/drivers/net/mlx4/mlx4_en.h +++
sys/ofed/drivers/net/mlx4/mlx4_en.h @@ -113,6 +113,14 @@ MLX4_EN_ALLOC_REPLACEMENT = 1, }; +#ifdef CONFIG_RATELIMIT +enum mlx4_en_rl_operation { + MLX4_EN_RL_ADD = 0, + MLX4_EN_RL_DEL = 1, + MLX4_EN_RL_MOD = 2, +}; +#endif + /* Receive fragment sizes; we use at most 3 fragments (for 9600 byte MTU * and 4K allocations) */ #if MJUMPAGESIZE == 4096 @@ -153,8 +161,16 @@ #define MLX4_EN_MAX_TX_RING_P_UP 32 #define MLX4_EN_NUM_UP 1 -#define MAX_TX_RINGS (MLX4_EN_MAX_TX_RING_P_UP * \ - MLX4_EN_NUM_UP) +#ifdef CONFIG_RATELIMIT +#define MLX4_EN_DEF_RL_TX_RING_SIZE 512 +#define MLX4_EN_DEF_RL_USER_PRIO 0 +#define MLX4_EN_DEF_MAX_RL_TX_RINGS 45000 +#else +#define MLX4_EN_DEF_MAX_RL_TX_RINGS 0 +#endif + +#define MAX_TX_RINGS ((MLX4_EN_MAX_TX_RING_P_UP * \ + (MLX4_EN_NUM_UP)) + MLX4_EN_DEF_MAX_RL_TX_RINGS) #define MLX4_EN_DEF_TX_RING_SIZE 1024 #define MLX4_EN_DEF_RX_RING_SIZE 1024 @@ -271,6 +287,14 @@ #define MLX4_EN_CX3_LOW_ID 0x1000 #define MLX4_EN_CX3_HIGH_ID 0x1005 +#ifdef CONFIG_RATELIMIT +struct mlx4_en_rl_data { + bool user_valid; + u8 rate_index; + struct sysctl_ctx_list rl_stats_ctx; +}; +#endif + struct mlx4_en_tx_ring { spinlock_t tx_lock; struct mlx4_hwq_resources wqres; @@ -309,6 +333,9 @@ int full_size; int inline_thold; u64 watchdog_time; +#ifdef CONFIG_RATELIMIT + struct mlx4_en_rl_data rl_data; +#endif }; struct mlx4_en_rx_desc { @@ -458,6 +485,10 @@ u8 mac_removed[MLX4_MAX_PORTS + 1]; unsigned long last_overflow_check; unsigned long overflow_period; +#ifdef CONFIG_RATELIMIT + u8 lst_of_prios; + u8 num_rl_prios; +#endif }; @@ -529,6 +560,27 @@ u16 frag_prefix_size; }; +#ifdef CONFIG_RATELIMIT +struct mlx4_en_reuse_index_list_element { + STAILQ_ENTRY(mlx4_en_reuse_index_list_element) entry; + int val; +}; + +struct mlx4_en_rl_task_list_element { + STAILQ_ENTRY(mlx4_en_rl_task_list_element) entry; + int ring_id; + enum mlx4_en_rl_operation operation; + u8 rate_index; +}; + +struct mlx4_en_rate_limit_indexes { + u32 rate; + u_int ref; + u8 burst_size; +}; + 
+extern int show_rl_sysctl_info; +#endif struct mlx4_en_priv { struct mlx4_en_dev *mdev; @@ -580,6 +632,10 @@ u8 num_tx_rings_p_up; u32 tx_ring_num; u32 rx_ring_num; +#ifdef CONFIG_RATELIMIT + u32 native_tx_ring_num; + u32 rate_limit_tx_ring_num; +#endif u32 rx_mb_size; struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS]; u16 rx_alloc_order; @@ -621,6 +677,9 @@ struct ifmedia media; volatile int blocked; struct sysctl_oid *sysctl; +#ifdef CONFIG_RATELIMIT + struct sysctl_oid *sysctl_stat; +#endif struct sysctl_ctx_list conf_ctx; struct sysctl_ctx_list stat_ctx; #define MLX4_EN_MAC_HASH_IDX 5 @@ -642,6 +701,20 @@ u64 if_counters_rx_errors; u64 if_counters_rx_no_buffer; +#ifdef CONFIG_RATELIMIT + spinlock_t tx_ring_index_lock; + spinlock_t rl_op_lock; + STAILQ_HEAD(, mlx4_en_reuse_index_list_element) reuse_index_list_head; + STAILQ_HEAD(, mlx4_en_rl_task_list_element) rl_op_list_head; + struct mlx4_en_reuse_index_list_element reuse_index_list_array [MAX_TX_RINGS]; + struct task rl_task; + struct taskqueue *rl_tq; + int num_rates_per_prio; + struct mlx4_en_rate_limit_indexes *rate_limits; + struct mutex rate_limit_table_lock; + int next_free_rl_index; + u32 new_rate_for_avail_index; +#endif }; enum mlx4_en_wol { @@ -802,6 +875,18 @@ int cq, int user_prio); void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring); +#ifdef CONFIG_RATELIMIT +int mlx4_en_create_rate_limit_ring(struct mlx4_en_priv *priv, + struct ifreq_txrtlmt *rl_req); +int mlx4_en_modify_rate_limit_ring(struct mlx4_en_priv *priv, + struct ifreq_txrtlmt *rl_req); +int mlx4_en_destroy_rate_limit_ring(struct mlx4_en_priv *priv, + struct ifreq_txrtlmt *rl_req); +void mlx4_en_async_rl_operation(void *context, int index); +void mlx4_en_rl_reused_index_insert(struct mlx4_en_priv *priv, uint32_t ring_id); +void mlx4_en_invalidate_rl_ring(struct mlx4_en_priv *priv, + uint32_t ring_id); +#endif void mlx4_en_qflush(struct ifnet *dev); int mlx4_en_create_rx_ring(struct 
mlx4_en_priv *priv, @@ -904,6 +989,11 @@ NETIF_MSG_WOL = 0x4000, }; +#ifdef CONFIG_RATELIMIT +#define TX_RING_USER_VALID(ring_index) \ + (priv->tx_ring[ring_index] && \ + priv->tx_ring[ring_index]->rl_data.user_valid) +#endif /* * printk / logging functions Index: sys/ofed/drivers/net/mlx4/qp.c =================================================================== --- sys/ofed/drivers/net/mlx4/qp.c +++ sys/ofed/drivers/net/mlx4/qp.c @@ -406,6 +406,74 @@ EXPORT_SYMBOL_GPL(mlx4_qp_alloc); +/* + * Update selected attributes of an existing QP with the UPDATE_QP firmware + * command. attr is a mask of MLX4_UPDATE_QP_* bits selecting which members + * of params are applied; an empty or unsupported mask yields -EINVAL. + */ +int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn, + enum mlx4_update_qp_attr attr, + struct mlx4_update_qp_params *params) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_update_qp_context *cmd; + u64 pri_addr_path_mask = 0; + u64 qp_mask = 0; + int err = 0; + + /* Validate before allocating, so that bad input cannot leak a mailbox */ + if (!attr || (attr & ~MLX4_UPDATE_QP_SUPPORTED_ATTRS)) + return -EINVAL; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + cmd = (struct mlx4_update_qp_context *)mailbox->buf; + + if (attr & MLX4_UPDATE_QP_SMAC) { + pri_addr_path_mask |= 1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX; + cmd->qp_context.pri_path.grh_mylmc = params->smac_index; + } + + if (attr & MLX4_UPDATE_QP_ETH_SRC_CHECK_MC_LB) { + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB)) { + mlx4_warn(dev, "Trying to set src check LB, but it isn't supported\n"); + err = -ENOTSUPP; + goto out; + } + pri_addr_path_mask |= 1ULL << MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB; + if (params->flags & MLX4_UPDATE_QP_PARAMS_FLAGS_ETH_CHECK_MC_LB) { + cmd->qp_context.pri_path.fl |= + MLX4_FL_ETH_SRC_CHECK_MC_LB; + } + } + + if (attr & MLX4_UPDATE_QP_VSD) { + qp_mask |= 1ULL << MLX4_UPD_QP_MASK_VSD; + if (params->flags & MLX4_UPDATE_QP_PARAMS_FLAGS_VSD_ENABLE) + cmd->qp_context.param3 |= cpu_to_be32(MLX4_STRIP_VLAN); + } + +#ifdef CONFIG_RATELIMIT + if (attr & MLX4_UPDATE_QP_RATE_LIMIT) { + qp_mask |= 1ULL << MLX4_UPD_QP_MASK_RATE_LIMIT; + cmd->qp_context.rate_limit_index = params->rl_index; + } +#endif + +
cmd->primary_addr_path_mask = cpu_to_be64(pri_addr_path_mask); + cmd->qp_mask = cpu_to_be64(qp_mask); + + err = mlx4_cmd(dev, mailbox->dma, qpn & 0xffffff, 0, + MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp) { struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; Index: sys/ofed/include/linux/mlx4/cmd.h =================================================================== --- sys/ofed/include/linux/mlx4/cmd.h +++ sys/ofed/include/linux/mlx4/cmd.h @@ -158,6 +158,11 @@ MLX4_QP_FLOW_STEERING_ATTACH = 0x65, MLX4_QP_FLOW_STEERING_DETACH = 0x66, MLX4_FLOW_STEERING_IB_UC_QP_RANGE = 0x64, + +#ifdef CONFIG_RATELIMIT + MLX4_CMD_QP_RLPP = 0x83, + MLX4_CMD_QP_RL_INDEX = 0x84, +#endif }; enum { Index: sys/ofed/include/linux/mlx4/device.h =================================================================== --- sys/ofed/include/linux/mlx4/device.h +++ sys/ofed/include/linux/mlx4/device.h @@ -64,6 +64,10 @@ #define CORE_CLOCK_MASK 0xffffffffffffULL +#ifdef CONFIG_RATELIMIT +#define MLX4_NUM_PRIOS_TOTAL 15 +#endif + enum { MLX4_FLAG_MSI_X = 1 << 0, MLX4_FLAG_OLD_PORT_CMDS = 1 << 1, @@ -427,6 +431,20 @@ u32 base_tunnel_sqpn; }; +#ifdef CONFIG_RATELIMIT +struct mlx4_rate_limit_caps { + u8 enable; + u8 max_unit; + u8 min_unit; + u16 max_val; + u16 min_val; + u32 calc_max_val; + u32 calc_min_val; + char min_val_str[32]; + char max_val_str[32]; +}; +#endif + struct mlx4_caps { u64 fw_ver; u32 function; @@ -524,8 +542,21 @@ u32 max_basic_counters; u32 max_extended_counters; u8 def_counter_index[MLX4_MAX_PORTS + 1]; +#ifdef CONFIG_RATELIMIT + struct mlx4_rate_limit_caps rl_caps; + int fw_reserved_qp_base; +#endif }; +#ifdef CONFIG_RATELIMIT +enum { /* rl */ + MLX4_QP_RATE_LIMIT_RESERVED = 0x0, + MLX4_QP_RATE_LIMIT_KBPS = 0x1, + MLX4_QP_RATE_LIMIT_MBPS = 0x2, + MLX4_QP_RATE_LIMIT_GBPS = 0x3, +}; +#endif + struct mlx4_buf_list { void *buf; 
dma_addr_t map; @@ -790,6 +821,9 @@ int oper_log_mgm_entry_size; u64 regid_promisc_array[MLX4_MAX_PORTS + 1]; u64 regid_allmulti_array[MLX4_MAX_PORTS + 1]; +#ifdef CONFIG_RATELIMIT + struct sysctl_ctx_list rl_ctx; +#endif }; struct mlx4_clock_params { @@ -1233,6 +1267,111 @@ }; }; +#ifdef CONFIG_RATELIMIT +struct mlx4_hw_num_of_rates { + __be16 reserved1; + __be16 available_RPP; + u8 reserved2[3]; + u8 RPP_prio_0; + u8 reserved3[3]; + u8 RPP_prio_1; + u8 reserved4[3]; + u8 RPP_prio_2; + u8 reserved5[3]; + u8 RPP_prio_3; + u8 reserved6[3]; + u8 RPP_prio_4; + u8 reserved7[3]; + u8 RPP_prio_5; + u8 reserved8[3]; + u8 RPP_prio_6; + u8 reserved9[3]; + u8 RPP_prio_7; + /* For IB Prios */ + u8 reserved10[3]; + u8 RPP_prio_8; + u8 reserved11[3]; + u8 RPP_prio_9; + u8 reserved12[3]; + u8 RPP_prio_10; + u8 reserved13[3]; + u8 RPP_prio_11; + u8 reserved14[3]; + u8 RPP_prio_12; + u8 reserved15[3]; + u8 RPP_prio_13; + u8 reserved16[3]; + u8 RPP_prio_14; + u8 reserved17[3]; + u8 RPP_prio_15; + __be32 base_qp_num; +}; + +struct mlx4_num_of_rates { + u16 available_RPP; + u8 RPP_per_prio[MLX4_NUM_PRIOS_TOTAL]; + u32 base_qp_num; +}; + +struct mlx4_hw_qp_rl_index { + __be32 reserved1; + __be32 rate_prio_0; + u8 reserved2[3]; + u8 burst_size_prio_0; + __be32 rate_prio_1; + u8 reserved3[3]; + u8 burst_size_prio_1; + __be32 rate_prio_2; + u8 reserved4[3]; + u8 burst_size_prio_2; + __be32 rate_prio_3; + u8 reserved5[3]; + u8 burst_size_prio_3; + __be32 rate_prio_4; + u8 reserved6[3]; + u8 burst_size_prio_4; + __be32 rate_prio_5; + u8 reserved7[3]; + u8 burst_size_prio_5; + __be32 rate_prio_6; + u8 reserved8[3]; + u8 burst_size_prio_6; + __be32 rate_prio_7; + u8 reserved9[3]; + u8 burst_size_prio_7; + /* For IB Prios */ + __be32 rate_prio_8; + u8 reserved10[3]; + u8 burst_size_prio_8; + __be32 rate_prio_9; + u8 reserved11[3]; + u8 burst_size_prio_9; + __be32 rate_prio_10; + u8 reserved12[3]; + u8 burst_size_prio_10; + __be32 rate_prio_11; + u8 reserved13[3]; + u8 burst_size_prio_11; + 
__be32 rate_prio_12; + u8 reserved14[3]; + u8 burst_size_prio_12; + __be32 rate_prio_13; + u8 reserved15[3]; + u8 burst_size_prio_13; + __be32 rate_prio_14; + u8 reserved16[3]; + u8 burst_size_prio_14; + __be32 rate_prio_15; + u8 reserved17[3]; + u8 burst_size_prio_15; +}; + +struct mlx4_qp_rl_index { /* array form of the per-priority rate / burst-size pairs, using plain u32/u8 types */ + u32 rates[MLX4_NUM_PRIOS_TOTAL]; /* NOTE(review): MLX4_NUM_PRIOS_TOTAL slots versus rate_prio_0..rate_prio_15 in mlx4_hw_qp_rl_index - confirm */ + u8 burst_size[MLX4_NUM_PRIOS_TOTAL]; +}; +#endif + int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn, enum mlx4_net_trans_promisc_mode mode); int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port, @@ -1271,6 +1410,15 @@ u8 op_modifier, u32 in_offset[], u32 counter_out[]); +#ifdef CONFIG_RATELIMIT +int mlx4_query_rl_fw_resources(struct mlx4_dev *dev, u8 port, u16 *available_RLPP); /* the rate-limit prototypes in this group are declared only when CONFIG_RATELIMIT is defined */ +int mlx4_alloc_rl_fw_resources(struct mlx4_dev *dev, u8 port, + struct mlx4_num_of_rates *all_num_rates); +int mlx4_set_rates_and_burst_size(struct mlx4_dev *dev, u8 port, u8 index, + struct mlx4_qp_rl_index *qp_rl_index); +u8 mlx4_parse_prios_for_rl(char *str, u8 *lst_of_prios, int max_num_prios); /* en_main.c stores the return value as num_rl_prios and then tests *lst_of_prios bit by bit, one bit per priority */ +#endif + int mlx4_test_interrupts(struct mlx4_dev *dev); int mlx4_assign_eq(struct mlx4_dev *dev, char* name, int * vector); void mlx4_release_eq(struct mlx4_dev *dev, int vec); Index: sys/ofed/include/linux/mlx4/qp.h =================================================================== --- sys/ofed/include/linux/mlx4/qp.h +++ sys/ofed/include/linux/mlx4/qp.h @@ -182,6 +182,10 @@ MLX4_FVL_RX_FORCE_ETH_VLAN = 1 << 0 /* enforce Eth rx vlan */ }; +enum { + MLX4_DISABLE_WQE_PREFETCH = (1 << 7) /* bit 7 (0x80); unlike the other additions here this enum is not guarded by CONFIG_RATELIMIT */ +}; + struct mlx4_qp_context { __be32 flags; __be32 pd; @@ -218,7 +222,14 @@ u8 reserved4[2]; u8 mtt_base_addr_h; __be32 mtt_base_addr_l; +#ifdef CONFIG_RATELIMIT + u32 reserved5[2]; + u8 reserved6[3]; + u8 rate_limit_index; + u32 reserved7[7]; /* 2*4 + 3 + 1 + 7*4 = 40 bytes, the same size as the reserved5[10] used in the #else branch (assuming 4-byte u32 and 1-byte u8), so the struct size is unchanged */ +#else u32 reserved5[10]; +#endif }; struct mlx4_update_qp_context { @@ -233,6 +244,9 @@ enum { MLX4_UPD_QP_MASK_PM_STATE = 32, MLX4_UPD_QP_MASK_VSD = 33, +#ifdef CONFIG_RATELIMIT + 
MLX4_UPD_QP_MASK_RATE_LIMIT = 39, /* sits beside PM_STATE (32) and VSD (33); presumably a bit position in the 64-bit update mask rather than a mask value - TODO confirm */ +#endif }; enum { @@ -425,6 +439,35 @@ __be32 byte_count; }; +enum mlx4_update_qp_attr { /* one bit per attribute that an update request may name */ + MLX4_UPDATE_QP_SMAC = 1 << 0, + MLX4_UPDATE_QP_ETH_SRC_CHECK_MC_LB = 1 << 1, + MLX4_UPDATE_QP_VSD = 1 << 2, +#ifdef CONFIG_RATELIMIT + MLX4_UPDATE_QP_RATE_LIMIT = 1 << 3, + MLX4_UPDATE_QP_SUPPORTED_ATTRS = (1 << 4) - 1 /* 0xf: bits 0..3, i.e. every attribute above including RATE_LIMIT */ +#else + MLX4_UPDATE_QP_SUPPORTED_ATTRS = (1 << 3) - 1 /* 0x7: bits 0..2 when RATE_LIMIT is not compiled in */ +#endif +}; + +enum mlx4_update_qp_params_flags { /* bit values; presumably stored in mlx4_update_qp_params.flags - TODO confirm */ + MLX4_UPDATE_QP_PARAMS_FLAGS_ETH_CHECK_MC_LB = 1 << 0, + MLX4_UPDATE_QP_PARAMS_FLAGS_VSD_ENABLE = 1 << 1, +}; + +struct mlx4_update_qp_params { /* argument record taken by mlx4_update_qp() */ + u8 smac_index; + u32 flags; +#ifdef CONFIG_RATELIMIT + u8 rl_index; /* exists only in CONFIG_RATELIMIT builds, so this struct's layout depends on that option */ +#endif +}; + +int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn, + enum mlx4_update_qp_attr attr, + struct mlx4_update_qp_params *params); + int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar,