Commit 1060f865 authored by Parav Pandit's avatar Parav Pandit Committed by Jason Gunthorpe
Browse files

IB/{core/cm}: Fix generating a return AH for RoCEE

When computing a UD reverse path (return AH) from a WC the code was not
doing a route lookup anchored in a specific netdevice. This caused several
bugs, including broken IPv6 link-local address support in RoCEv2. [1]

This fixes the lookup by determining the GID table entry that the HW
matched to the SGID for the WC and then using the netdevice from that
entry to perform the route and ND lookup for the 'DGID' to build a return
AH.

RoCE GID table management ensures that right upper netdevices of the
physical netdevices are added. Therefore init_ah_from_wc doesn't need to
perform such check.

Now that route lookup is done based on the netdevice of the GID entry,
simplify code to not have ifindex and vlan pointers.  As part of that,
refactor to have netdevice as input parameter.  This is already discussed
at [2].

Finally ib_init_ah_from_wc resolves dmac for unicast GID in similar way as
what ib_resolve_eth_dmac() does. So ib_resolve_eth_dmac is refactored to
split for unicast and non unicast GIDs, so that it can be reused by
ib_init_ah_from_wc.

While we are at refactoring ib_resolve_eth_dmac(), it is further
simplified

(a) to avoid hoplimit as optional parameter, as there is only one
    user who always queries hoplimit.
(b) for empty line.
(c) avoided zero initialization of ret.
(d) removed as exported symbol as only ib core uses it.

For IPv6, this is tested using simple rping test as below.
 rping -sv -a ::0
 rping -c -a fe80::268a:7ff:fe55:4661%ens2f1 -C 1 -v -d

[1] https://www.spinics.net/lists/linux-rdma/msg45690.html
[2] https://www.spinics.net/lists/linux-rdma/msg45710.html

Signed-off-by: default avatarParav Pandit <parav@mellanox.com>
Reviewed-by: default avatarMatan Barak <matanb@mellanox.com>
Reviewed-by: default avatarMark Bloch <markb@mellanox.com>
Reported-by: default avatarRoland Dreier <roland@purestorage.com>
Signed-off-by: default avatarLeon Romanovsky <leon@kernel.org>
Signed-off-by: default avatarJason Gunthorpe <jgg@mellanox.com>
parent 8b0c05dc
......@@ -761,27 +761,23 @@ static void resolve_cb(int status, struct sockaddr *src_addr,
int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
const union ib_gid *dgid,
u8 *dmac, u16 *vlan_id, int *if_index,
u8 *dmac, const struct net_device *ndev,
int *hoplimit)
{
int ret = 0;
struct rdma_dev_addr dev_addr;
struct resolve_cb_context ctx;
struct net_device *dev;
union {
struct sockaddr _sockaddr;
struct sockaddr_in _sockaddr_in;
struct sockaddr_in6 _sockaddr_in6;
} sgid_addr, dgid_addr;
int ret;
rdma_gid2ip(&sgid_addr._sockaddr, sgid);
rdma_gid2ip(&dgid_addr._sockaddr, dgid);
memset(&dev_addr, 0, sizeof(dev_addr));
if (if_index)
dev_addr.bound_dev_if = *if_index;
dev_addr.bound_dev_if = ndev->ifindex;
dev_addr.net = &init_net;
ctx.addr = &dev_addr;
......@@ -798,19 +794,9 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
return ret;
memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN);
dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if);
if (!dev)
return -ENODEV;
if (if_index)
*if_index = dev_addr.bound_dev_if;
if (vlan_id)
*vlan_id = rdma_vlan_dev_vlan_id(dev);
if (hoplimit)
*hoplimit = dev_addr.hoplimit;
dev_put(dev);
return ret;
*hoplimit = dev_addr.hoplimit;
return 0;
}
EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh);
int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id)
{
......
......@@ -481,6 +481,40 @@ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr,
}
EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr);
/* Resolve destination mac address and hop limit for unicast destination
* GID entry, considering the source GID entry as well.
* ah_attribute must have have valid port_num, sgid_index.
*/
static int ib_resolve_unicast_gid_dmac(struct ib_device *device,
struct rdma_ah_attr *ah_attr)
{
struct ib_gid_attr sgid_attr;
struct ib_global_route *grh;
int hop_limit = 0xff;
union ib_gid sgid;
int ret;
grh = rdma_ah_retrieve_grh(ah_attr);
ret = ib_query_gid(device,
rdma_ah_get_port_num(ah_attr),
grh->sgid_index,
&sgid, &sgid_attr);
if (ret || !sgid_attr.ndev) {
if (!ret)
ret = -ENXIO;
return ret;
}
ret = rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid,
ah_attr->roce.dmac,
sgid_attr.ndev, &hop_limit);
dev_put(sgid_attr.ndev);
grh->hop_limit = hop_limit;
return ret;
}
/*
* This function creates ah from the incoming packet.
* Incoming packet has dgid of the receiver node on which this code is
......@@ -490,9 +524,6 @@ EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr);
* as sgid and, sgid is used as dgid because sgid contains destinations
* GID whom to respond to.
*
* This is why when calling rdma_addr_find_l2_eth_by_grh() function, the
* position of arguments dgid and sgid do not match the order of the
* parameters.
*/
int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
const struct ib_wc *wc, const struct ib_grh *grh,
......@@ -523,57 +554,33 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
if (ret)
return ret;
rdma_ah_set_sl(ah_attr, wc->sl);
rdma_ah_set_port_num(ah_attr, port_num);
if (rdma_protocol_roce(device, port_num)) {
int if_index = 0;
u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
wc->vlan_id : 0xffff;
struct net_device *idev;
struct net_device *resolved_dev;
if (!(wc->wc_flags & IB_WC_GRH))
return -EPROTOTYPE;
if (!device->get_netdev)
return -EOPNOTSUPP;
idev = device->get_netdev(device, port_num);
if (!idev)
return -ENODEV;
ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid,
ah_attr->roce.dmac,
wc->wc_flags & IB_WC_WITH_VLAN ?
NULL : &vlan_id,
&if_index, &hoplimit);
if (ret) {
dev_put(idev);
return ret;
}
resolved_dev = dev_get_by_index(&init_net, if_index);
rcu_read_lock();
if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev,
resolved_dev))
ret = -EHOSTUNREACH;
rcu_read_unlock();
dev_put(idev);
dev_put(resolved_dev);
if (ret)
return ret;
ret = get_sgid_index_from_eth(device, port_num, vlan_id,
&dgid, gid_type, &gid_index);
ret = get_sgid_index_from_eth(device, port_num,
vlan_id, &dgid,
gid_type, &gid_index);
if (ret)
return ret;
}
rdma_ah_set_dlid(ah_attr, wc->slid);
rdma_ah_set_sl(ah_attr, wc->sl);
rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits);
rdma_ah_set_port_num(ah_attr, port_num);
flow_class = be32_to_cpu(grh->version_tclass_flow);
rdma_ah_set_grh(ah_attr, &sgid,
flow_class & 0xFFFFF,
(u8)gid_index, hoplimit,
(flow_class >> 20) & 0xFF);
return ib_resolve_unicast_gid_dmac(device, ah_attr);
} else {
rdma_ah_set_dlid(ah_attr, wc->slid);
rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits);
if (wc->wc_flags & IB_WC_GRH) {
if (!rdma_cap_eth_ah(device, port_num)) {
if (wc->wc_flags & IB_WC_GRH) {
if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) {
ret = ib_find_cached_gid_by_port(device, &dgid,
IB_GID_TYPE_IB,
......@@ -584,16 +591,15 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
} else {
gid_index = 0;
}
}
flow_class = be32_to_cpu(grh->version_tclass_flow);
rdma_ah_set_grh(ah_attr, &sgid,
flow_class & 0xFFFFF,
(u8)gid_index, hoplimit,
(flow_class >> 20) & 0xFF);
flow_class = be32_to_cpu(grh->version_tclass_flow);
rdma_ah_set_grh(ah_attr, &sgid,
flow_class & 0xFFFFF,
(u8)gid_index, hoplimit,
(flow_class >> 20) & 0xFF);
}
return 0;
}
return 0;
}
EXPORT_SYMBOL(ib_init_ah_from_wc);
......@@ -1290,34 +1296,8 @@ static int ib_resolve_eth_dmac(struct ib_device *device,
(char *)ah_attr->roce.dmac);
}
} else {
union ib_gid sgid;
struct ib_gid_attr sgid_attr;
int ifindex;
int hop_limit;
ret = ib_query_gid(device,
rdma_ah_get_port_num(ah_attr),
grh->sgid_index,
&sgid, &sgid_attr);
if (ret || !sgid_attr.ndev) {
if (!ret)
ret = -ENXIO;
goto out;
}
ifindex = sgid_attr.ndev->ifindex;
ret =
rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid,
ah_attr->roce.dmac,
NULL, &ifindex, &hop_limit);
dev_put(sgid_attr.ndev);
grh->hop_limit = hop_limit;
ret = ib_resolve_unicast_gid_dmac(device, ah_attr);
}
out:
return ret;
}
......
......@@ -134,7 +134,7 @@ int rdma_addr_size(struct sockaddr *addr);
int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id);
int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
const union ib_gid *dgid,
u8 *smac, u16 *vlan_id, int *if_index,
u8 *dmac, const struct net_device *ndev,
int *hoplimit);
static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment