kernel/cgroup/rdma.c

DEFINITIONS

This source file includes the following definitions:
  1. css_rdmacg
  2. parent_rdmacg
  3. get_current_rdmacg
  4. set_resource_limit
  5. set_all_resource_max_limit
  6. free_cg_rpool_locked
  7. find_cg_rpool_locked
  8. get_cg_rpool_locked
  9. uncharge_cg_locked
  10. rdmacg_uncharge_hierarchy
  11. rdmacg_uncharge
  12. rdmacg_try_charge
  13. rdmacg_register_device
  14. rdmacg_unregister_device
  15. parse_resource
  16. rdmacg_parse_limits
  17. rdmacg_get_device_locked
  18. rdmacg_resource_set_max
  19. print_rpool_values
  20. rdmacg_resource_read
  21. rdmacg_css_alloc
  22. rdmacg_css_free
  23. rdmacg_css_offline

// SPDX-License-Identifier: GPL-2.0-only
/*
 * RDMA resource limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop processes from consuming
 * additional RDMA resources after a certain limit is reached.
 *
 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
 */

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/cgroup.h>
#include <linux/parser.h>
#include <linux/cgroup_rdma.h>

#define RDMACG_MAX_STR "max"

/*
 * Protects the list of resource pools maintained on a per-cgroup basis
 * and the rdma device list.
 */
static DEFINE_MUTEX(rdmacg_mutex);
static LIST_HEAD(rdmacg_devices);

enum rdmacg_file_type {
        RDMACG_RESOURCE_TYPE_MAX,
        RDMACG_RESOURCE_TYPE_STAT,
};

/*
 * Resource table definition as seen by the user. Add entries to it
 * when more resources are added/defined at the IB verb/core layer.
 */
static char const *rdmacg_resource_names[] = {
        [RDMACG_RESOURCE_HCA_HANDLE]    = "hca_handle",
        [RDMACG_RESOURCE_HCA_OBJECT]    = "hca_object",
};

/* resource tracker for each resource of rdma cgroup */
struct rdmacg_resource {
        int max;
        int usage;
};

/*
 * Resource pool object which represents, per cgroup and per device,
 * the resources of that device. There are multiple instances of this
 * object per cgroup, therefore it cannot be embedded within the
 * rdma_cgroup structure; it is maintained as a list.
 */
struct rdmacg_resource_pool {
        struct rdmacg_device    *device;
        struct rdmacg_resource  resources[RDMACG_RESOURCE_MAX];

        struct list_head        cg_node;
        struct list_head        dev_node;

        /* total number of resources currently charged to this pool */
        u64                     usage_sum;
        /* number of resources whose limit is set to max */
        int                     num_max_cnt;
};

static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
{
        return container_of(css, struct rdma_cgroup, css);
}

static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
{
        return css_rdmacg(cg->css.parent);
}

static inline struct rdma_cgroup *get_current_rdmacg(void)
{
        return css_rdmacg(task_get_css(current, rdma_cgrp_id));
}

static void set_resource_limit(struct rdmacg_resource_pool *rpool,
                               int index, int new_max)
{
        if (new_max == S32_MAX) {
                if (rpool->resources[index].max != S32_MAX)
                        rpool->num_max_cnt++;
        } else {
                if (rpool->resources[index].max == S32_MAX)
                        rpool->num_max_cnt--;
        }
        rpool->resources[index].max = new_max;
}

static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
{
        int i;

        for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
                set_resource_limit(rpool, i, S32_MAX);
}

static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
{
        lockdep_assert_held(&rdmacg_mutex);

        list_del(&rpool->cg_node);
        list_del(&rpool->dev_node);
        kfree(rpool);
}

static struct rdmacg_resource_pool *
find_cg_rpool_locked(struct rdma_cgroup *cg,
                     struct rdmacg_device *device)
{
        struct rdmacg_resource_pool *pool;

        lockdep_assert_held(&rdmacg_mutex);

        list_for_each_entry(pool, &cg->rpools, cg_node)
                if (pool->device == device)
                        return pool;

        return NULL;
}

static struct rdmacg_resource_pool *
get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
{
        struct rdmacg_resource_pool *rpool;

        rpool = find_cg_rpool_locked(cg, device);
        if (rpool)
                return rpool;

        rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
        if (!rpool)
                return ERR_PTR(-ENOMEM);

        rpool->device = device;
        set_all_resource_max_limit(rpool);

        INIT_LIST_HEAD(&rpool->cg_node);
        INIT_LIST_HEAD(&rpool->dev_node);
        list_add_tail(&rpool->cg_node, &cg->rpools);
        list_add_tail(&rpool->dev_node, &device->rpools);
        return rpool;
}

/**
 * uncharge_cg_locked - uncharge resource for rdma cgroup
 * @cg: pointer to the cgroup to uncharge
 * @device: pointer to rdmacg device
 * @index: index of the resource to uncharge in cg (resource pool)
 *
 * It also frees the resource pool, which was created as part of the
 * charging operation, once no resources remain attached to it.
 */
static void
uncharge_cg_locked(struct rdma_cgroup *cg,
                   struct rdmacg_device *device,
                   enum rdmacg_resource_type index)
{
        struct rdmacg_resource_pool *rpool;

        rpool = find_cg_rpool_locked(cg, device);

        /*
         * rpool cannot be NULL at this stage. If it is, there is a bug in
         * the IB stack or the rdma controller; warn and let the kernel keep
         * operating instead of crashing the system.
         */
        if (unlikely(!rpool)) {
                pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg);
                return;
        }

        rpool->resources[index].usage--;

        /*
         * A negative count (or overflow) is invalid;
         * it indicates a bug in the rdma controller.
         */
        WARN_ON_ONCE(rpool->resources[index].usage < 0);
        rpool->usage_sum--;
        if (rpool->usage_sum == 0 &&
            rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
                /*
                 * No user of the rpool and all entries are set to max, so
                 * safe to delete this rpool.
                 */
                free_cg_rpool_locked(rpool);
        }
}

/**
 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
 * @cg: pointer to the cgroup to start uncharging from
 * @device: pointer to rdmacg device
 * @stop_cg: cgroup at which to stop uncharging while traversing the
 *           hierarchy; it is not uncharged itself
 * @index: index of the resource to uncharge in cg in given resource pool
 */
static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
                                      struct rdmacg_device *device,
                                      struct rdma_cgroup *stop_cg,
                                      enum rdmacg_resource_type index)
{
        struct rdma_cgroup *p;

        mutex_lock(&rdmacg_mutex);

        for (p = cg; p != stop_cg; p = parent_rdmacg(p))
                uncharge_cg_locked(p, device, index);

        mutex_unlock(&rdmacg_mutex);

        css_put(&cg->css);
}

/**
 * rdmacg_uncharge - hierarchically uncharge rdma resource count
 * @cg: pointer to the cgroup to uncharge
 * @device: pointer to rdmacg device
 * @index: index of the resource to uncharge in cgroup in given resource pool
 */
void rdmacg_uncharge(struct rdma_cgroup *cg,
                     struct rdmacg_device *device,
                     enum rdmacg_resource_type index)
{
        if (index >= RDMACG_RESOURCE_MAX)
                return;

        rdmacg_uncharge_hierarchy(cg, device, NULL, index);
}
EXPORT_SYMBOL(rdmacg_uncharge);

/**
 * rdmacg_try_charge - hierarchically try to charge the rdma resource
 * @rdmacg: pointer to rdma cgroup which will own this resource
 * @device: pointer to rdmacg device
 * @index: index of the resource to charge in cgroup (resource pool)
 *
 * This function charges the resource in a hierarchical way.
 * It will fail if the charge would cause the new value to exceed the
 * hierarchical limit.
 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
 * On success, a pointer to the charged rdma cgroup is returned through
 * @rdmacg.
 *
 * The charger needs to account resources on two criteria:
 * (a) per cgroup and (b) per device resource usage.
 * Per-cgroup accounting ensures that the tasks of a cgroup don't cross
 * the configured limits. Per-device accounting provides granular
 * configuration when multiple devices are in use. On the first charge,
 * a resource pool is allocated for every cgroup encountered in the
 * hierarchy; later charges and uncharges find the pools already in place
 * and are therefore much faster.
 */
int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
                      struct rdmacg_device *device,
                      enum rdmacg_resource_type index)
{
        struct rdma_cgroup *cg, *p;
        struct rdmacg_resource_pool *rpool;
        s64 new;
        int ret = 0;

        if (index >= RDMACG_RESOURCE_MAX)
                return -EINVAL;

        /*
         * hold on to css, as cgroup can be removed but resource
         * accounting happens on css.
         */
        cg = get_current_rdmacg();

        mutex_lock(&rdmacg_mutex);
        for (p = cg; p; p = parent_rdmacg(p)) {
                rpool = get_cg_rpool_locked(p, device);
                if (IS_ERR(rpool)) {
                        ret = PTR_ERR(rpool);
                        goto err;
                } else {
                        new = rpool->resources[index].usage + 1;
                        if (new > rpool->resources[index].max) {
                                ret = -EAGAIN;
                                goto err;
                        } else {
                                rpool->resources[index].usage = new;
                                rpool->usage_sum++;
                        }
                }
        }
        mutex_unlock(&rdmacg_mutex);

        *rdmacg = cg;
        return 0;

err:
        mutex_unlock(&rdmacg_mutex);
        rdmacg_uncharge_hierarchy(cg, device, p, index);
        return ret;
}
EXPORT_SYMBOL(rdmacg_try_charge);
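
/*
 * Usage sketch (illustrative, not part of the upstream file): a hypothetical
 * verbs-layer caller would pair the charge with an uncharge of the same
 * resource, keeping the cgroup pointer returned through @rdmacg so the
 * uncharge walks the same hierarchy:
 *
 *	struct rdma_cgroup *cg;
 *	int ret;
 *
 *	ret = rdmacg_try_charge(&cg, &my_rdmacg_dev, RDMACG_RESOURCE_HCA_OBJECT);
 *	if (ret)
 *		return ret;	// limit reached (-EAGAIN) or -ENOMEM
 *	... create the HCA object ...
 *	rdmacg_uncharge(cg, &my_rdmacg_dev, RDMACG_RESOURCE_HCA_OBJECT);
 *
 * "my_rdmacg_dev" is an assumed struct rdmacg_device that the caller has
 * already registered with rdmacg_register_device().
 */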

/**
 * rdmacg_register_device - register rdmacg device to rdma controller.
 * @device: pointer to rdmacg device whose resources need to be accounted.
 *
 * If the IB stack wishes a device to participate in rdma cgroup resource
 * tracking, it must invoke this API to register the device with the rdma
 * cgroup controller before any user space application can start using its
 * RDMA resources.
 */
void rdmacg_register_device(struct rdmacg_device *device)
{
        INIT_LIST_HEAD(&device->dev_node);
        INIT_LIST_HEAD(&device->rpools);

        mutex_lock(&rdmacg_mutex);
        list_add_tail(&device->dev_node, &rdmacg_devices);
        mutex_unlock(&rdmacg_mutex);
}
EXPORT_SYMBOL(rdmacg_register_device);

/**
 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
 * @device: pointer to rdmacg device which was previously registered with rdma
 *          controller using rdmacg_register_device().
 *
 * The IB stack must invoke this after all the resources of the IB device
 * have been destroyed and after ensuring that no more resources will be
 * created while this API is running.
 */
void rdmacg_unregister_device(struct rdmacg_device *device)
{
        struct rdmacg_resource_pool *rpool, *tmp;

        /*
         * Synchronize with any active limit setting or usage query
         * happening via the cgroup filesystem.
         */
        mutex_lock(&rdmacg_mutex);
        list_del_init(&device->dev_node);

        /*
         * Now that this device is off the device list, it's safe to free
         * all the rpool resources.
         */
        list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
                free_cg_rpool_locked(rpool);

        mutex_unlock(&rdmacg_mutex);
}
EXPORT_SYMBOL(rdmacg_unregister_device);
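
/*
 * Lifecycle sketch (illustrative, not part of the upstream file): a
 * hypothetical IB device driver would register with the controller when the
 * device is added and unregister only after all of the device's resources
 * have been destroyed. "my_rdmacg_dev" and "hca0" are assumed names:
 *
 *	static struct rdmacg_device my_rdmacg_dev;
 *
 *	my_rdmacg_dev.name = "hca0";	// name shown in rdma.max/rdma.current
 *	rdmacg_register_device(&my_rdmacg_dev);
 *	... device lifetime: rdmacg_try_charge()/rdmacg_uncharge() ...
 *	rdmacg_unregister_device(&my_rdmacg_dev);
 */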

static int parse_resource(char *c, int *intval)
{
        substring_t argstr;
        char *name, *value = c;
        size_t len;
        int ret, i;

        name = strsep(&value, "=");
        if (!name || !value)
                return -EINVAL;

        i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
        if (i < 0)
                return i;

        len = strlen(value);

        argstr.from = value;
        argstr.to = value + len;

        ret = match_int(&argstr, intval);
        if (ret >= 0) {
                if (*intval < 0)
                        return -EINVAL;
                return i;
        }
        if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
                *intval = S32_MAX;
                return i;
        }
        return -EINVAL;
}

static int rdmacg_parse_limits(char *options,
                               int *new_limits, unsigned long *enables)
{
        char *c;
        int err = -EINVAL;

        /* parse resource options */
        while ((c = strsep(&options, " ")) != NULL) {
                int index, intval;

                index = parse_resource(c, &intval);
                if (index < 0)
                        goto err;

                new_limits[index] = intval;
                *enables |= BIT(index);
        }
        return 0;

err:
        return err;
}

static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
{
        struct rdmacg_device *device;

        lockdep_assert_held(&rdmacg_mutex);

        list_for_each_entry(device, &rdmacg_devices, dev_node)
                if (!strcmp(name, device->name))
                        return device;

        return NULL;
}

static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
                                       char *buf, size_t nbytes, loff_t off)
{
        struct rdma_cgroup *cg = css_rdmacg(of_css(of));
        const char *dev_name;
        struct rdmacg_resource_pool *rpool;
        struct rdmacg_device *device;
        char *options = strstrip(buf);
        int *new_limits;
        unsigned long enables = 0;
        int i = 0, ret = 0;

        /* extract the device name first */
        dev_name = strsep(&options, " ");
        if (!dev_name) {
                ret = -EINVAL;
                goto err;
        }

        new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
        if (!new_limits) {
                ret = -ENOMEM;
                goto err;
        }

        ret = rdmacg_parse_limits(options, new_limits, &enables);
        if (ret)
                goto parse_err;

        /* acquire lock to synchronize with hot plug devices */
        mutex_lock(&rdmacg_mutex);

        device = rdmacg_get_device_locked(dev_name);
        if (!device) {
                ret = -ENODEV;
                goto dev_err;
        }

        rpool = get_cg_rpool_locked(cg, device);
        if (IS_ERR(rpool)) {
                ret = PTR_ERR(rpool);
                goto dev_err;
        }

        /* now set the new limits of the rpool */
        for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
                set_resource_limit(rpool, i, new_limits[i]);

        if (rpool->usage_sum == 0 &&
            rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
                /*
                 * No user of the rpool and all entries are set to max, so
                 * safe to delete this rpool.
                 */
                free_cg_rpool_locked(rpool);
        }

dev_err:
        mutex_unlock(&rdmacg_mutex);

parse_err:
        kfree(new_limits);

err:
        return ret ?: nbytes;
}
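
/*
 * The write handler above accepts a device name followed by space-separated
 * "<resource>=<value>" pairs, where <value> is a non-negative integer or the
 * literal "max". For example, assuming a registered device named "mlx4_0"
 * (the device name is illustrative):
 *
 *	echo "mlx4_0 hca_handle=2 hca_object=2000" > rdma.max
 *	echo "mlx4_0 hca_object=max" > rdma.max
 */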

static void print_rpool_values(struct seq_file *sf,
                               struct rdmacg_resource_pool *rpool)
{
        enum rdmacg_file_type sf_type;
        int i;
        u32 value;

        sf_type = seq_cft(sf)->private;

        for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
                seq_puts(sf, rdmacg_resource_names[i]);
                seq_putc(sf, '=');
                if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
                        if (rpool)
                                value = rpool->resources[i].max;
                        else
                                value = S32_MAX;
                } else {
                        if (rpool)
                                value = rpool->resources[i].usage;
                        else
                                value = 0;
                }

                if (value == S32_MAX)
                        seq_puts(sf, RDMACG_MAX_STR);
                else
                        seq_printf(sf, "%d", value);
                seq_putc(sf, ' ');
        }
}

static int rdmacg_resource_read(struct seq_file *sf, void *v)
{
        struct rdmacg_device *device;
        struct rdmacg_resource_pool *rpool;
        struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));

        mutex_lock(&rdmacg_mutex);

        list_for_each_entry(device, &rdmacg_devices, dev_node) {
                seq_printf(sf, "%s ", device->name);

                rpool = find_cg_rpool_locked(cg, device);
                print_rpool_values(sf, rpool);

                seq_putc(sf, '\n');
        }

        mutex_unlock(&rdmacg_mutex);
        return 0;
}
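
/*
 * Reading rdma.max or rdma.current therefore yields one line per registered
 * device, for example (device names are illustrative):
 *
 *	mlx4_0 hca_handle=2 hca_object=2000
 *	ocrdma1 hca_handle=3 hca_object=max
 */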

static struct cftype rdmacg_files[] = {
        {
                .name = "max",
                .write = rdmacg_resource_set_max,
                .seq_show = rdmacg_resource_read,
                .private = RDMACG_RESOURCE_TYPE_MAX,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "current",
                .seq_show = rdmacg_resource_read,
                .private = RDMACG_RESOURCE_TYPE_STAT,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        { }     /* terminate */
};

static struct cgroup_subsys_state *
rdmacg_css_alloc(struct cgroup_subsys_state *parent)
{
        struct rdma_cgroup *cg;

        cg = kzalloc(sizeof(*cg), GFP_KERNEL);
        if (!cg)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&cg->rpools);
        return &cg->css;
}

static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
        struct rdma_cgroup *cg = css_rdmacg(css);

        kfree(cg);
}

/**
 * rdmacg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away and is responsible
 * for shooting down all the resource tracking associated with @css. As part
 * of that it resets all resource pool limits to the max value, so that once
 * the remaining resources are uncharged, the associated resource pools can
 * be freed as well.
 */
static void rdmacg_css_offline(struct cgroup_subsys_state *css)
{
        struct rdma_cgroup *cg = css_rdmacg(css);
        struct rdmacg_resource_pool *rpool;

        mutex_lock(&rdmacg_mutex);

        list_for_each_entry(rpool, &cg->rpools, cg_node)
                set_all_resource_max_limit(rpool);

        mutex_unlock(&rdmacg_mutex);
}

struct cgroup_subsys rdma_cgrp_subsys = {
        .css_alloc      = rdmacg_css_alloc,
        .css_free       = rdmacg_css_free,
        .css_offline    = rdmacg_css_offline,
        .legacy_cftypes = rdmacg_files,
        .dfl_cftypes    = rdmacg_files,
};
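
/*
 * Note (illustrative, not part of the upstream file): on the cgroup v2
 * hierarchy the controller is enabled for a cgroup's children in the usual
 * way, e.g.:
 *
 *	echo "+rdma" > /sys/fs/cgroup/cgroup.subtree_control
 *
 * after which the rdma.max and rdma.current files defined above appear in
 * the child cgroups (CFTYPE_NOT_ON_ROOT keeps them out of the root cgroup).
 */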
