RDMA/mlx5: Change the cache structure to an RB-tree
Currently, the cache structure is a static linear array, so its size is
limited to the number of entries in it and is not expandable.  The entries
are dedicated to mkeys of size 2^x with no access_flags; mkeys with
different properties are not cacheable.

In this patch, we change the cache structure to an RB-tree.  This will
allow extending the cache to support more entries with different mkey
properties.
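
To illustrate the idea (a sketch only, not part of the commit): each cache
entry embeds an rb_node and is keyed by its order, and a lookup returns
either an exact match or the smallest entry whose order is large enough.
The sketch below condenses mkey_cache_ent_from_order() from the diff, with
the entry struct trimmed to the fields the lookup touches; the names are
illustrative.

#include <linux/rbtree.h>

struct ent {
	struct rb_node node;	/* links the entry into the cache rb-tree */
	unsigned int order;	/* entry serves mkeys of size 2^order */
};

static struct ent *ent_from_order(struct rb_root *root, unsigned int order)
{
	struct rb_node *node = root->rb_node;
	struct ent *cur, *smallest = NULL;

	while (node) {
		cur = rb_entry(node, struct ent, node);
		if (cur->order == order)
			return cur;		/* exact fit */
		if (cur->order > order) {
			smallest = cur;		/* big enough, keep as fallback */
			node = node->rb_left;
		} else {
			node = node->rb_right;
		}
	}
	return smallest;	/* smallest order >= request, or NULL */
}

Insertion is the usual rb-tree pattern: walk to a NULL child while comparing
orders, then rb_link_node() and rb_insert_color(), as mlx5_cache_ent_insert()
below does under cache->rb_lock.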

Link: https://lore.kernel.org/r/20230125222807.6921-4-michaelgur@nvidia.com
Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
mikijoy authored and jgunthorpe committed Jan 27, 2023
1 parent 18b1746 commit b958451
Showing 3 changed files with 132 additions and 47 deletions.
11 changes: 9 additions & 2 deletions drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -741,6 +741,8 @@ struct mlx5_cache_ent {
u32 access_mode;
unsigned int ndescs;

struct rb_node node;

u8 disabled:1;
u8 fill_to_high_water:1;

@@ -770,8 +772,9 @@ struct mlx5r_async_create_mkey {

struct mlx5_mkey_cache {
struct workqueue_struct *wq;
struct mlx5_cache_ent ent[MAX_MKEY_CACHE_ENTRIES];
struct dentry *root;
struct rb_root rb_root;
struct mutex rb_lock;
struct dentry *fs_root;
unsigned long last_add;
};

@@ -1316,11 +1319,15 @@ void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
int order);

struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
struct mlx5_cache_ent *ent,
int access_flags);

struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev, u32 order,
int access_flags);
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
struct ib_mr_status *mr_status);
struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
160 changes: 120 additions & 40 deletions drivers/infiniband/hw/mlx5/mr.c
@@ -515,18 +515,22 @@ static const struct file_operations limit_fops = {

static bool someone_adding(struct mlx5_mkey_cache *cache)
{
unsigned int i;

for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
struct mlx5_cache_ent *ent = &cache->ent[i];
bool ret;
struct mlx5_cache_ent *ent;
struct rb_node *node;
bool ret;

mutex_lock(&cache->rb_lock);
for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
ent = rb_entry(node, struct mlx5_cache_ent, node);
xa_lock_irq(&ent->mkeys);
ret = ent->stored < ent->limit;
xa_unlock_irq(&ent->mkeys);
if (ret)
if (ret) {
mutex_unlock(&cache->rb_lock);
return true;
}
}
mutex_unlock(&cache->rb_lock);
return false;
}

@@ -637,6 +641,59 @@ static void delayed_cache_work_func(struct work_struct *work)
__cache_work_func(ent);
}

static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
struct mlx5_cache_ent *ent)
{
struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
struct mlx5_cache_ent *cur;

mutex_lock(&cache->rb_lock);
/* Figure out where to put new node */
while (*new) {
cur = rb_entry(*new, struct mlx5_cache_ent, node);
parent = *new;
if (ent->order < cur->order)
new = &((*new)->rb_left);
if (ent->order > cur->order)
new = &((*new)->rb_right);
if (ent->order == cur->order) {
mutex_unlock(&cache->rb_lock);
return -EEXIST;
}
}

/* Add new node and rebalance tree. */
rb_link_node(&ent->node, parent, new);
rb_insert_color(&ent->node, &cache->rb_root);

mutex_unlock(&cache->rb_lock);
return 0;
}

static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
unsigned int order)
{
struct rb_node *node = dev->cache.rb_root.rb_node;
struct mlx5_cache_ent *cur, *smallest = NULL;

/*
* Find the smallest ent with order >= requested_order.
*/
while (node) {
cur = rb_entry(node, struct mlx5_cache_ent, node);
if (cur->order > order) {
smallest = cur;
node = node->rb_left;
}
if (cur->order < order)
node = node->rb_right;
if (cur->order == order)
return cur;
}

return smallest;
}

struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
struct mlx5_cache_ent *ent,
int access_flags)
@@ -677,10 +734,16 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
return mr;
}

static void clean_keys(struct mlx5_ib_dev *dev, int c)
struct mlx5_ib_mr *mlx5_mr_cache_alloc_order(struct mlx5_ib_dev *dev,
u32 order, int access_flags)
{
struct mlx5_cache_ent *ent = mkey_cache_ent_from_order(dev, order);

return mlx5_mr_cache_alloc(dev, ent, access_flags);
}

static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
{
struct mlx5_mkey_cache *cache = &dev->cache;
struct mlx5_cache_ent *ent = &cache->ent[c];
u32 mkey;

cancel_delayed_work(&ent->dwork);
@@ -699,8 +762,8 @@ static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
if (!mlx5_debugfs_root || dev->is_rep)
return;

debugfs_remove_recursive(dev->cache.root);
dev->cache.root = NULL;
debugfs_remove_recursive(dev->cache.fs_root);
dev->cache.fs_root = NULL;
}

static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
@@ -713,12 +776,13 @@ static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
if (!mlx5_debugfs_root || dev->is_rep)
return;

cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev));
dir = mlx5_debugfs_get_dev_root(dev->mdev);
cache->fs_root = debugfs_create_dir("mr_cache", dir);

for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
ent = &cache->ent[i];
ent = mkey_cache_ent_from_order(dev, i);
sprintf(ent->name, "%d", ent->order);
dir = debugfs_create_dir(ent->name, cache->root);
dir = debugfs_create_dir(ent->name, cache->fs_root);
debugfs_create_file("size", 0600, dir, ent, &size_fops);
debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
debugfs_create_ulong("cur", 0400, dir, &ent->stored);
@@ -733,13 +797,39 @@ static void delay_time_func(struct timer_list *t)
WRITE_ONCE(dev->fill_delay, 0);
}

struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
int order)
{
struct mlx5_cache_ent *ent;
int ret;

ent = kzalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
return ERR_PTR(-ENOMEM);

xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
ent->order = order;
ent->dev = dev;

INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);

ret = mlx5_cache_ent_insert(&dev->cache, ent);
if (ret) {
kfree(ent);
return ERR_PTR(ret);
}
return ent;
}

int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
{
struct mlx5_mkey_cache *cache = &dev->cache;
struct mlx5_cache_ent *ent;
int i;

mutex_init(&dev->slow_path_mutex);
mutex_init(&dev->cache.rb_lock);
dev->cache.rb_root = RB_ROOT;
cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
if (!cache->wq) {
mlx5_ib_warn(dev, "failed to create work queue\n");
@@ -749,13 +839,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
timer_setup(&dev->delay_timer, delay_time_func, 0);
for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
ent = &cache->ent[i];
xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
ent->order = i + 2;
ent->dev = dev;
ent->limit = 0;

INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
ent = mlx5r_cache_create_ent(dev, i);

if (i > MKEY_CACHE_LAST_STD_ENTRY) {
mlx5_odp_init_mkey_cache_entry(ent);
@@ -785,14 +869,16 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)

int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
{
unsigned int i;
struct rb_root *root = &dev->cache.rb_root;
struct mlx5_cache_ent *ent;
struct rb_node *node;

if (!dev->cache.wq)
return 0;

for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
struct mlx5_cache_ent *ent = &dev->cache.ent[i];

mutex_lock(&dev->cache.rb_lock);
for (node = rb_first(root); node; node = rb_next(node)) {
ent = rb_entry(node, struct mlx5_cache_ent, node);
xa_lock_irq(&ent->mkeys);
ent->disabled = true;
xa_unlock_irq(&ent->mkeys);
@@ -802,8 +888,15 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
mlx5_mkey_cache_debugfs_cleanup(dev);
mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++)
clean_keys(dev, i);
node = rb_first(root);
while (node) {
ent = rb_entry(node, struct mlx5_cache_ent, node);
node = rb_next(node);
clean_keys(dev, ent);
rb_erase(&ent->node, root);
kfree(ent);
}
mutex_unlock(&dev->cache.rb_lock);

destroy_workqueue(dev->cache.wq);
del_timer_sync(&dev->delay_timer);
@@ -876,19 +969,6 @@ static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
return MLX5_MAX_UMR_SHIFT;
}

static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
unsigned int order)
{
struct mlx5_mkey_cache *cache = &dev->cache;

if (order < cache->ent[0].order)
return &cache->ent[0];
order = order - cache->ent[0].order;
if (order > MKEY_CACHE_LAST_STD_ENTRY)
return NULL;
return &cache->ent[order];
}

static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
u64 length, int access_flags, u64 iova)
{
8 changes: 3 additions & 5 deletions drivers/infiniband/hw/mlx5/odp.c
@@ -419,8 +419,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
return ERR_CAST(odp);

BUILD_BUG_ON(order > MKEY_CACHE_LAST_STD_ENTRY);
mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[order],
imr->access_flags);
mr = mlx5_mr_cache_alloc_order(dev, order, imr->access_flags);
if (IS_ERR(mr)) {
ib_umem_odp_release(odp);
return mr;
@@ -494,9 +493,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
if (IS_ERR(umem_odp))
return ERR_CAST(umem_odp);

imr = mlx5_mr_cache_alloc(dev,
&dev->cache.ent[MLX5_IMR_KSM_CACHE_ENTRY],
access_flags);
imr = mlx5_mr_cache_alloc_order(dev, MLX5_IMR_KSM_CACHE_ENTRY,
access_flags);
if (IS_ERR(imr)) {
ib_umem_odp_release(umem_odp);
return imr;
