Skip to content

Commit

Permalink
Merge branch 'net-nexthop-increase-weight-to-u16'
Browse files Browse the repository at this point in the history
Petr Machata says:

====================
net: nexthop: Increase weight to u16

In CLOS networks, as link failures occur at various points in the network,
ECMP weights of the involved nodes are adjusted to compensate. With high
fan-out of the involved nodes, and overall high number of nodes,
a (non-)ECMP weight ratio that we would like to configure does not fit into
8 bits. Instead of, say, 255:254, we might like to configure something like
1000:999. For these deployments, the 8-bit weight may not be enough.

To that end, in this patchset increase the next hop weight from u8 to u16.

Patch #1 adds a flag that indicates whether the reserved fields are zeroed.
This is a follow-up to a new fix merged in commit 6d745cd ("net:
nexthop: Initialize all fields in dumped nexthops"). The theory behind this
patch is that there is a strict ordering between the fields actually being
zeroed, the kernel declaring that they are, and the kernel repurposing the
fields. Thus clients can use the flag to tell if it is safe to interpret
the reserved fields in any way.

Patch #2 contains the substantial code and the commit message covers the
details of the changes.

Patches #3 to #6 add selftests.
====================

Link: https://patch.msgid.link/cover.1723036486.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
kuba-moo committed Aug 13, 2024
2 parents 246ef40 + 4b808f4 commit e96f6fd
Show file tree
Hide file tree
Showing 9 changed files with 201 additions and 37 deletions.
4 changes: 2 additions & 2 deletions include/net/nexthop.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ struct nh_grp_entry_stats {
struct nh_grp_entry {
struct nexthop *nh;
struct nh_grp_entry_stats __percpu *stats;
u8 weight;
u16 weight;

union {
struct {
Expand Down Expand Up @@ -192,7 +192,7 @@ struct nh_notifier_single_info {
};

struct nh_notifier_grp_entry_info {
u8 weight;
u16 weight;
struct nh_notifier_single_info nh;
};

Expand Down
10 changes: 9 additions & 1 deletion include/uapi/linux/nexthop.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,15 @@ struct nhmsg {
struct nexthop_grp {
__u32 id; /* nexthop id - must exist */
__u8 weight; /* weight of this nexthop */
__u8 resvd1;
__u8 weight_high; /* high order bits of weight */
__u16 resvd2;
};

static inline __u16 nexthop_grp_weight(const struct nexthop_grp *entry)
{
return ((entry->weight_high << 8) | entry->weight) + 1;
}

enum {
NEXTHOP_GRP_TYPE_MPATH, /* hash-threshold nexthop group
* default type if not specified
Expand All @@ -33,6 +38,9 @@ enum {
#define NHA_OP_FLAG_DUMP_STATS BIT(0)
#define NHA_OP_FLAG_DUMP_HW_STATS BIT(1)

/* Response OP_FLAGS. */
#define NHA_OP_FLAG_RESP_GRP_RESVD_0 BIT(31) /* Dump clears resvd fields. */

enum {
NHA_UNSPEC,
NHA_ID, /* u32; id for nexthop. id == 0 means auto-assign */
Expand Down
49 changes: 32 additions & 17 deletions net/ipv4/nexthop.c
Original file line number Diff line number Diff line change
Expand Up @@ -865,15 +865,18 @@ static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh,
}

static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
u32 op_flags)
u32 op_flags, u32 *resp_op_flags)
{
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
struct nexthop_grp *p;
size_t len = nhg->num_nh * sizeof(*p);
struct nlattr *nla;
u16 group_type = 0;
u16 weight;
int i;

*resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0;

if (nhg->hash_threshold)
group_type = NEXTHOP_GRP_TYPE_MPATH;
else if (nhg->resilient)
Expand All @@ -888,9 +891,12 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,

p = nla_data(nla);
for (i = 0; i < nhg->num_nh; ++i) {
weight = nhg->nh_entries[i].weight - 1;

*p++ = (struct nexthop_grp) {
.id = nhg->nh_entries[i].nh->id,
.weight = nhg->nh_entries[i].weight - 1,
.weight = weight,
.weight_high = weight >> 8,
};
}

Expand Down Expand Up @@ -934,10 +940,12 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,

if (nh->is_group) {
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
u32 resp_op_flags = 0;

if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
goto nla_put_failure;
if (nla_put_nh_group(skb, nh, op_flags))
if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) ||
nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags))
goto nla_put_failure;
goto out;
}
Expand Down Expand Up @@ -1050,7 +1058,9 @@ static size_t nh_nlmsg_size(struct nexthop *nh)
sz += nla_total_size(4); /* NHA_ID */

if (nh->is_group)
sz += nh_nlmsg_size_grp(nh);
sz += nh_nlmsg_size_grp(nh) +
nla_total_size(4) + /* NHA_OP_FLAGS */
0;
else
sz += nh_nlmsg_size_single(nh);

Expand Down Expand Up @@ -1280,11 +1290,14 @@ static int nh_check_attr_group(struct net *net,

nhg = nla_data(tb[NHA_GROUP]);
for (i = 0; i < len; ++i) {
if (nhg[i].resvd1 || nhg[i].resvd2) {
NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
if (nhg[i].resvd2) {
NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0");
return -EINVAL;
}
if (nhg[i].weight > 254) {
if (nexthop_grp_weight(&nhg[i]) == 0) {
/* 0xffff got passed in, representing weight of 0x10000,
* which is too heavy.
*/
NL_SET_ERR_MSG(extack, "Invalid value for weight");
return -EINVAL;
}
Expand Down Expand Up @@ -1880,9 +1893,9 @@ static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
static void nh_res_group_rebalance(struct nh_group *nhg,
struct nh_res_table *res_table)
{
int prev_upper_bound = 0;
int total = 0;
int w = 0;
u16 prev_upper_bound = 0;
u32 total = 0;
u32 w = 0;
int i;

INIT_LIST_HEAD(&res_table->uw_nh_entries);
Expand All @@ -1892,11 +1905,12 @@ static void nh_res_group_rebalance(struct nh_group *nhg,

for (i = 0; i < nhg->num_nh; ++i) {
struct nh_grp_entry *nhge = &nhg->nh_entries[i];
int upper_bound;
u16 upper_bound;
u64 btw;

w += nhge->weight;
upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w,
total);
btw = ((u64)res_table->num_nh_buckets) * w;
upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total);
nhge->res.wants_buckets = upper_bound - prev_upper_bound;
prev_upper_bound = upper_bound;

Expand Down Expand Up @@ -1962,16 +1976,16 @@ static void replace_nexthop_grp_res(struct nh_group *oldg,

static void nh_hthr_group_rebalance(struct nh_group *nhg)
{
int total = 0;
int w = 0;
u32 total = 0;
u32 w = 0;
int i;

for (i = 0; i < nhg->num_nh; ++i)
total += nhg->nh_entries[i].weight;

for (i = 0; i < nhg->num_nh; ++i) {
struct nh_grp_entry *nhge = &nhg->nh_entries[i];
int upper_bound;
u32 upper_bound;

w += nhge->weight;
upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
Expand Down Expand Up @@ -2713,7 +2727,8 @@ static struct nexthop *nexthop_create_group(struct net *net,
goto out_no_nh;
}
nhg->nh_entries[i].nh = nhe;
nhg->nh_entries[i].weight = entry[i].weight + 1;
nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]);

list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
nhg->nh_entries[i].nh_parent = nh;
}
Expand Down
55 changes: 54 additions & 1 deletion tools/testing/selftests/net/fib_nexthops.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,12 @@ log_test()
else
ret=1
nfail=$((nfail+1))
printf "TEST: %-60s [FAIL]\n" "${msg}"
if [[ $rc -eq $ksft_skip ]]; then
printf "TEST: %-60s [SKIP]\n" "${msg}"
else
printf "TEST: %-60s [FAIL]\n" "${msg}"
fi

if [ "$VERBOSE" = "1" ]; then
echo " rc=$rc, expected $expected"
fi
Expand Down Expand Up @@ -923,6 +928,29 @@ ipv6_grp_fcnal()

ipv6_grp_refs
log_test $? 0 "Nexthop group replace refcounts"

#
# 16-bit weights.
#
run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1"
run_cmd "$IP nexthop add id 66 dev veth1"

run_cmd "$IP nexthop add id 103 group 62,1000"
if [[ $? == 0 ]]; then
local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535"
run_cmd "$IP nexthop replace $GRP"
check_nexthop "id 103" "$GRP"
rc=$?
else
rc=$ksft_skip
fi

$IP nexthop flush >/dev/null 2>&1

log_test $rc 0 "16-bit weights"
}

ipv6_res_grp_fcnal()
Expand Down Expand Up @@ -987,6 +1015,31 @@ ipv6_res_grp_fcnal()
check_nexthop_bucket "list id 102" \
"id 102 index 0 nhid 63 id 102 index 1 nhid 62 id 102 index 2 nhid 62 id 102 index 3 nhid 62"
log_test $? 0 "Nexthop buckets updated after replace - nECMP"

#
# 16-bit weights.
#
run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1"
run_cmd "$IP nexthop add id 66 dev veth1"

run_cmd "$IP nexthop add id 103 group 62,1000 type resilient buckets 32"
if [[ $? == 0 ]]; then
local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535 $(:
)type resilient buckets 32 idle_timer 0 $(:
)unbalanced_timer 0"
run_cmd "$IP nexthop replace $GRP"
check_nexthop "id 103" "$GRP unbalanced_time 0"
rc=$?
else
rc=$ksft_skip
fi

$IP nexthop flush >/dev/null 2>&1

log_test $rc 0 "16-bit weights"
}

ipv6_fcnal_runtime()
Expand Down
7 changes: 7 additions & 0 deletions tools/testing/selftests/net/forwarding/lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,13 @@ xfail_on_slow()
fi
}

omit_on_slow()
{
if [[ $KSFT_MACHINE_SLOW != yes ]]; then
"$@"
fi
}

xfail_on_veth()
{
local dev=$1; shift
Expand Down
40 changes: 32 additions & 8 deletions tools/testing/selftests/net/forwarding/router_mpath_nh.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ ALL_TESTS="
ping_ipv4
ping_ipv6
multipath_test
multipath16_test
ping_ipv4_blackhole
ping_ipv6_blackhole
nh_stats_test_v4
Expand Down Expand Up @@ -226,9 +227,11 @@ routing_nh_obj()

multipath4_test()
{
local desc="$1"
local weight_rp12=$2
local weight_rp13=$3
local desc=$1; shift
local weight_rp12=$1; shift
local weight_rp13=$1; shift
local ports=${1-sp=1024,dp=0-32768}; shift

local t0_rp12 t0_rp13 t1_rp12 t1_rp13
local packets_rp12 packets_rp13

Expand All @@ -242,7 +245,8 @@ multipath4_test()
t0_rp13=$(link_stats_tx_packets_get $rp13)

ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \
-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
-d $MZ_DELAY -t udp "$ports"
sleep 1

t1_rp12=$(link_stats_tx_packets_get $rp12)
t1_rp13=$(link_stats_tx_packets_get $rp13)
Expand All @@ -258,9 +262,11 @@ multipath4_test()

multipath6_test()
{
local desc="$1"
local weight_rp12=$2
local weight_rp13=$3
local desc=$1; shift
local weight_rp12=$1; shift
local weight_rp13=$1; shift
local ports=${1-sp=1024,dp=0-32768}; shift

local t0_rp12 t0_rp13 t1_rp12 t1_rp13
local packets_rp12 packets_rp13

Expand All @@ -275,7 +281,8 @@ multipath6_test()
t0_rp13=$(link_stats_tx_packets_get $rp13)

$MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \
-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
-d $MZ_DELAY -t udp "$ports"
sleep 1

t1_rp12=$(link_stats_tx_packets_get $rp12)
t1_rp13=$(link_stats_tx_packets_get $rp13)
Expand Down Expand Up @@ -313,6 +320,23 @@ multipath_test()
multipath6_test "Weighted MP 11:45" 11 45
}

multipath16_test()
{
check_nhgw16 104 || return

log_info "Running 16-bit IPv4 multipath tests"
multipath4_test "65535:65535" 65535 65535
multipath4_test "128:512" 128 512
omit_on_slow \
multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535

log_info "Running 16-bit IPv6 multipath tests"
multipath6_test "65535:65535" 65535 65535
multipath6_test "128:512" 128 512
omit_on_slow \
multipath6_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
}

ping_ipv4_blackhole()
{
RET=0
Expand Down
13 changes: 13 additions & 0 deletions tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,16 @@ __nh_stats_test_v6()
$MZ -6 $h1 -A 2001:db8:1::2 -B 2001:db8:2::2
sysctl_restore net.ipv6.fib_multipath_hash_policy
}

check_nhgw16()
{
local nhid=$1; shift

ip nexthop replace id 9999 group "$nhid,65535" &>/dev/null
if (( $? )); then
log_test_skip "16-bit multipath tests" \
"iproute2 or the kernel do not support 16-bit next hop weights"
return 1
fi
ip nexthop del id 9999 ||:
}
Loading

0 comments on commit e96f6fd

Please sign in to comment.