cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

device.c (78350B)


      1/*
      2 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
      3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
      4 *
      5 * This software is available to you under a choice of one of two
      6 * licenses.  You may choose to be licensed under the terms of the GNU
      7 * General Public License (GPL) Version 2, available from the file
      8 * COPYING in the main directory of this source tree, or the
      9 * OpenIB.org BSD license below:
     10 *
     11 *     Redistribution and use in source and binary forms, with or
     12 *     without modification, are permitted provided that the following
     13 *     conditions are met:
     14 *
     15 *      - Redistributions of source code must retain the above
     16 *        copyright notice, this list of conditions and the following
     17 *        disclaimer.
     18 *
     19 *      - Redistributions in binary form must reproduce the above
     20 *        copyright notice, this list of conditions and the following
     21 *        disclaimer in the documentation and/or other materials
     22 *        provided with the distribution.
     23 *
     24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     31 * SOFTWARE.
     32 */
     33
     34#include <linux/module.h>
     35#include <linux/string.h>
     36#include <linux/errno.h>
     37#include <linux/kernel.h>
     38#include <linux/slab.h>
     39#include <linux/init.h>
     40#include <linux/netdevice.h>
     41#include <net/net_namespace.h>
     42#include <linux/security.h>
     43#include <linux/notifier.h>
     44#include <linux/hashtable.h>
     45#include <rdma/rdma_netlink.h>
     46#include <rdma/ib_addr.h>
     47#include <rdma/ib_cache.h>
     48#include <rdma/rdma_counter.h>
     49
     50#include "core_priv.h"
     51#include "restrack.h"
     52
     53MODULE_AUTHOR("Roland Dreier");
     54MODULE_DESCRIPTION("core kernel InfiniBand API");
     55MODULE_LICENSE("Dual BSD/GPL");
     56
     57struct workqueue_struct *ib_comp_wq;
     58struct workqueue_struct *ib_comp_unbound_wq;
     59struct workqueue_struct *ib_wq;
     60EXPORT_SYMBOL_GPL(ib_wq);
     61static struct workqueue_struct *ib_unreg_wq;
     62
     63/*
     64 * Each of the three rwsem locks (devices, clients, client_data) protects the
     65 * xarray of the same name. Specifically it allows the caller to assert that
     66 * the MARK will/will not be changing under the lock, and for devices and
     67 * clients, that the value in the xarray is still a valid pointer. Change of
     68 * the MARK is linked to the object state, so holding the lock and testing the
     69 * MARK also asserts that the contained object is in a certain state.
     70 *
     71 * This is used to build a two stage register/unregister flow where objects
     72 * can continue to be in the xarray even though they are still in progress to
     73 * register/unregister.
     74 *
     75 * The xarray itself provides additional locking, and restartable iteration,
     76 * which is also relied on.
     77 *
     78 * Locks should not be nested, with the exception of client_data, which is
     79 * allowed to nest under the read side of the other two locks.
     80 *
     81 * The devices_rwsem also protects the device name list, any change or
     82 * assignment of device name must also hold the write side to guarantee unique
     83 * names.
     84 */
     85
     86/*
     87 * devices contains devices that have had their names assigned. The
     88 * devices may not be registered. Users that care about the registration
     89 * status need to call ib_device_try_get() on the device to ensure it is
     90 * registered, and keep it registered, for the required duration.
     91 *
     92 */
     93static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
     94static DECLARE_RWSEM(devices_rwsem);
     95#define DEVICE_REGISTERED XA_MARK_1
     96
     97static u32 highest_client_id;
     98#define CLIENT_REGISTERED XA_MARK_1
     99static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
    100static DECLARE_RWSEM(clients_rwsem);
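
/*
 * Illustrative sketch, not part of the original file: walking only fully
 * registered devices under the locking scheme described above. The helper
 * name and callback are hypothetical.
 */
static void example_for_each_registered(void (*fn)(struct ib_device *dev))
{
	struct ib_device *dev;
	unsigned long index;

	down_read(&devices_rwsem);
	/* The read side guarantees DEVICE_REGISTERED cannot change under us. */
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
		fn(dev);
	up_read(&devices_rwsem);
}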
    101
    102static void ib_client_put(struct ib_client *client)
    103{
    104	if (refcount_dec_and_test(&client->uses))
    105		complete(&client->uses_zero);
    106}
    107
    108/*
    109 * If client_data is registered then the corresponding client must also still
    110 * be registered.
    111 */
    112#define CLIENT_DATA_REGISTERED XA_MARK_1
    113
    114unsigned int rdma_dev_net_id;
    115
    116/*
    117 * A list of net namespaces is maintained in an xarray. This is necessary
    118 * because we can't get the locking right using the existing net ns list. We
     119 * would require an init_net callback after the list is updated.
    120 */
    121static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
    122/*
    123 * rwsem to protect accessing the rdma_nets xarray entries.
    124 */
    125static DECLARE_RWSEM(rdma_nets_rwsem);
    126
    127bool ib_devices_shared_netns = true;
    128module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
    129MODULE_PARM_DESC(netns_mode,
    130		 "Share device among net namespaces; default=1 (shared)");
    131/**
    132 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
    133 *			     from a specified net namespace or not.
    134 * @dev:	Pointer to rdma device which needs to be checked
     135 * @net:	Pointer to net namespace for which access is to be checked
    136 *
    137 * When the rdma device is in shared mode, it ignores the net namespace.
    138 * When the rdma device is exclusive to a net namespace, rdma device net
    139 * namespace is checked against the specified one.
    140 */
    141bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
    142{
    143	return (ib_devices_shared_netns ||
    144		net_eq(read_pnet(&dev->coredev.rdma_net), net));
    145}
    146EXPORT_SYMBOL(rdma_dev_access_netns);
    147
    148/*
    149 * xarray has this behavior where it won't iterate over NULL values stored in
    150 * allocated arrays.  So we need our own iterator to see all values stored in
    151 * the array. This does the same thing as xa_for_each except that it also
    152 * returns NULL valued entries if the array is allocating. Simplified to only
    153 * work on simple xarrays.
    154 */
    155static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
    156			     xa_mark_t filter)
    157{
    158	XA_STATE(xas, xa, *indexp);
    159	void *entry;
    160
    161	rcu_read_lock();
    162	do {
    163		entry = xas_find_marked(&xas, ULONG_MAX, filter);
    164		if (xa_is_zero(entry))
    165			break;
    166	} while (xas_retry(&xas, entry));
    167	rcu_read_unlock();
    168
    169	if (entry) {
    170		*indexp = xas.xa_index;
    171		if (xa_is_zero(entry))
    172			return NULL;
    173		return entry;
    174	}
    175	return XA_ERROR(-ENOENT);
    176}
    177#define xan_for_each_marked(xa, index, entry, filter)                          \
    178	for (index = 0, entry = xan_find_marked(xa, &(index), filter);         \
    179	     !xa_is_err(entry);                                                \
    180	     (index)++, entry = xan_find_marked(xa, &(index), filter))
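
/*
 * Illustrative sketch, not part of the original file: how the helper above is
 * typically used. Unlike xa_for_each(), NULL-valued entries are still visited,
 * which the rename path later in this file relies on. The function name is
 * hypothetical.
 */
static void example_walk_client_data(struct ib_device *ibdev)
{
	unsigned long index;
	void *client_data;

	down_read(&ibdev->client_data_rwsem);
	xan_for_each_marked(&ibdev->client_data, index, client_data,
			    CLIENT_DATA_REGISTERED) {
		/* client_data may legitimately be NULL here. */
	}
	up_read(&ibdev->client_data_rwsem);
}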
    181
    182/* RCU hash table mapping netdevice pointers to struct ib_port_data */
    183static DEFINE_SPINLOCK(ndev_hash_lock);
    184static DECLARE_HASHTABLE(ndev_hash, 5);
    185
    186static void free_netdevs(struct ib_device *ib_dev);
    187static void ib_unregister_work(struct work_struct *work);
    188static void __ib_unregister_device(struct ib_device *device);
    189static int ib_security_change(struct notifier_block *nb, unsigned long event,
    190			      void *lsm_data);
    191static void ib_policy_change_task(struct work_struct *work);
    192static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);
    193
    194static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
    195			   struct va_format *vaf)
    196{
    197	if (ibdev && ibdev->dev.parent)
    198		dev_printk_emit(level[1] - '0',
    199				ibdev->dev.parent,
    200				"%s %s %s: %pV",
    201				dev_driver_string(ibdev->dev.parent),
    202				dev_name(ibdev->dev.parent),
    203				dev_name(&ibdev->dev),
    204				vaf);
    205	else if (ibdev)
    206		printk("%s%s: %pV",
    207		       level, dev_name(&ibdev->dev), vaf);
    208	else
    209		printk("%s(NULL ib_device): %pV", level, vaf);
    210}
    211
    212void ibdev_printk(const char *level, const struct ib_device *ibdev,
    213		  const char *format, ...)
    214{
    215	struct va_format vaf;
    216	va_list args;
    217
    218	va_start(args, format);
    219
    220	vaf.fmt = format;
    221	vaf.va = &args;
    222
    223	__ibdev_printk(level, ibdev, &vaf);
    224
    225	va_end(args);
    226}
    227EXPORT_SYMBOL(ibdev_printk);
    228
    229#define define_ibdev_printk_level(func, level)                  \
    230void func(const struct ib_device *ibdev, const char *fmt, ...)  \
    231{                                                               \
    232	struct va_format vaf;                                   \
    233	va_list args;                                           \
    234								\
    235	va_start(args, fmt);                                    \
    236								\
    237	vaf.fmt = fmt;                                          \
    238	vaf.va = &args;                                         \
    239								\
    240	__ibdev_printk(level, ibdev, &vaf);                     \
    241								\
    242	va_end(args);                                           \
    243}                                                               \
    244EXPORT_SYMBOL(func);
    245
    246define_ibdev_printk_level(ibdev_emerg, KERN_EMERG);
    247define_ibdev_printk_level(ibdev_alert, KERN_ALERT);
    248define_ibdev_printk_level(ibdev_crit, KERN_CRIT);
    249define_ibdev_printk_level(ibdev_err, KERN_ERR);
    250define_ibdev_printk_level(ibdev_warn, KERN_WARNING);
    251define_ibdev_printk_level(ibdev_notice, KERN_NOTICE);
    252define_ibdev_printk_level(ibdev_info, KERN_INFO);
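
/*
 * Illustrative sketch, not part of the original file: how callers use the
 * per-level helpers generated above. The function and the error condition are
 * hypothetical.
 */
static void example_report(struct ib_device *ibdev, int err)
{
	if (err)
		ibdev_err(ibdev, "operation failed: %d\n", err);
	else
		ibdev_info(ibdev, "operation completed\n");
}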
    253
    254static struct notifier_block ibdev_lsm_nb = {
    255	.notifier_call = ib_security_change,
    256};
    257
    258static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
    259				 struct net *net);
    260
    261/* Pointer to the RCU head at the start of the ib_port_data array */
    262struct ib_port_data_rcu {
    263	struct rcu_head rcu_head;
    264	struct ib_port_data pdata[];
    265};
    266
    267static void ib_device_check_mandatory(struct ib_device *device)
    268{
    269#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
    270	static const struct {
    271		size_t offset;
    272		char  *name;
    273	} mandatory_table[] = {
    274		IB_MANDATORY_FUNC(query_device),
    275		IB_MANDATORY_FUNC(query_port),
    276		IB_MANDATORY_FUNC(alloc_pd),
    277		IB_MANDATORY_FUNC(dealloc_pd),
    278		IB_MANDATORY_FUNC(create_qp),
    279		IB_MANDATORY_FUNC(modify_qp),
    280		IB_MANDATORY_FUNC(destroy_qp),
    281		IB_MANDATORY_FUNC(post_send),
    282		IB_MANDATORY_FUNC(post_recv),
    283		IB_MANDATORY_FUNC(create_cq),
    284		IB_MANDATORY_FUNC(destroy_cq),
    285		IB_MANDATORY_FUNC(poll_cq),
    286		IB_MANDATORY_FUNC(req_notify_cq),
    287		IB_MANDATORY_FUNC(get_dma_mr),
    288		IB_MANDATORY_FUNC(reg_user_mr),
    289		IB_MANDATORY_FUNC(dereg_mr),
    290		IB_MANDATORY_FUNC(get_port_immutable)
    291	};
    292	int i;
    293
    294	device->kverbs_provider = true;
    295	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
    296		if (!*(void **) ((void *) &device->ops +
    297				 mandatory_table[i].offset)) {
    298			device->kverbs_provider = false;
    299			break;
    300		}
    301	}
    302}
    303
    304/*
     305 * Caller must perform ib_device_put() to release the device reference
     306 * when ib_device_get_by_index() returns a valid device pointer.
    307 */
    308struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)
    309{
    310	struct ib_device *device;
    311
    312	down_read(&devices_rwsem);
    313	device = xa_load(&devices, index);
    314	if (device) {
    315		if (!rdma_dev_access_netns(device, net)) {
    316			device = NULL;
    317			goto out;
    318		}
    319
    320		if (!ib_device_try_get(device))
    321			device = NULL;
    322	}
    323out:
    324	up_read(&devices_rwsem);
    325	return device;
    326}
    327
    328/**
    329 * ib_device_put - Release IB device reference
    330 * @device: device whose reference to be released
    331 *
    332 * ib_device_put() releases reference to the IB device to allow it to be
    333 * unregistered and eventually free.
    334 */
    335void ib_device_put(struct ib_device *device)
    336{
    337	if (refcount_dec_and_test(&device->refcount))
    338		complete(&device->unreg_completion);
    339}
    340EXPORT_SYMBOL(ib_device_put);
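
/*
 * Illustrative sketch, not part of the original file: the get/put pairing
 * described above. The caller name and index value are hypothetical.
 */
static void example_use_device(const struct net *net, u32 index)
{
	struct ib_device *device = ib_device_get_by_index(net, index);

	if (!device)
		return;

	/* The device cannot complete unregistration while this ref is held. */

	ib_device_put(device);
}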
    341
    342static struct ib_device *__ib_device_get_by_name(const char *name)
    343{
    344	struct ib_device *device;
    345	unsigned long index;
    346
    347	xa_for_each (&devices, index, device)
    348		if (!strcmp(name, dev_name(&device->dev)))
    349			return device;
    350
    351	return NULL;
    352}
    353
    354/**
    355 * ib_device_get_by_name - Find an IB device by name
    356 * @name: The name to look for
    357 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
    358 *
    359 * Find and hold an ib_device by its name. The caller must call
    360 * ib_device_put() on the returned pointer.
    361 */
    362struct ib_device *ib_device_get_by_name(const char *name,
    363					enum rdma_driver_id driver_id)
    364{
    365	struct ib_device *device;
    366
    367	down_read(&devices_rwsem);
    368	device = __ib_device_get_by_name(name);
    369	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
    370	    device->ops.driver_id != driver_id)
    371		device = NULL;
    372
    373	if (device) {
    374		if (!ib_device_try_get(device))
    375			device = NULL;
    376	}
    377	up_read(&devices_rwsem);
    378	return device;
    379}
    380EXPORT_SYMBOL(ib_device_get_by_name);
    381
    382static int rename_compat_devs(struct ib_device *device)
    383{
    384	struct ib_core_device *cdev;
    385	unsigned long index;
    386	int ret = 0;
    387
    388	mutex_lock(&device->compat_devs_mutex);
    389	xa_for_each (&device->compat_devs, index, cdev) {
    390		ret = device_rename(&cdev->dev, dev_name(&device->dev));
    391		if (ret) {
    392			dev_warn(&cdev->dev,
    393				 "Fail to rename compatdev to new name %s\n",
    394				 dev_name(&device->dev));
    395			break;
    396		}
    397	}
    398	mutex_unlock(&device->compat_devs_mutex);
    399	return ret;
    400}
    401
    402int ib_device_rename(struct ib_device *ibdev, const char *name)
    403{
    404	unsigned long index;
    405	void *client_data;
    406	int ret;
    407
    408	down_write(&devices_rwsem);
    409	if (!strcmp(name, dev_name(&ibdev->dev))) {
    410		up_write(&devices_rwsem);
    411		return 0;
    412	}
    413
    414	if (__ib_device_get_by_name(name)) {
    415		up_write(&devices_rwsem);
    416		return -EEXIST;
    417	}
    418
    419	ret = device_rename(&ibdev->dev, name);
    420	if (ret) {
    421		up_write(&devices_rwsem);
    422		return ret;
    423	}
    424
    425	strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
    426	ret = rename_compat_devs(ibdev);
    427
    428	downgrade_write(&devices_rwsem);
    429	down_read(&ibdev->client_data_rwsem);
    430	xan_for_each_marked(&ibdev->client_data, index, client_data,
    431			    CLIENT_DATA_REGISTERED) {
    432		struct ib_client *client = xa_load(&clients, index);
    433
    434		if (!client || !client->rename)
    435			continue;
    436
    437		client->rename(ibdev, client_data);
    438	}
    439	up_read(&ibdev->client_data_rwsem);
    440	up_read(&devices_rwsem);
    441	return 0;
    442}
    443
    444int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
    445{
    446	if (use_dim > 1)
    447		return -EINVAL;
    448	ibdev->use_cq_dim = use_dim;
    449
    450	return 0;
    451}
    452
    453static int alloc_name(struct ib_device *ibdev, const char *name)
    454{
    455	struct ib_device *device;
    456	unsigned long index;
    457	struct ida inuse;
    458	int rc;
    459	int i;
    460
    461	lockdep_assert_held_write(&devices_rwsem);
    462	ida_init(&inuse);
    463	xa_for_each (&devices, index, device) {
    464		char buf[IB_DEVICE_NAME_MAX];
    465
    466		if (sscanf(dev_name(&device->dev), name, &i) != 1)
    467			continue;
    468		if (i < 0 || i >= INT_MAX)
    469			continue;
    470		snprintf(buf, sizeof buf, name, i);
    471		if (strcmp(buf, dev_name(&device->dev)) != 0)
    472			continue;
    473
    474		rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
    475		if (rc < 0)
    476			goto out;
    477	}
    478
    479	rc = ida_alloc(&inuse, GFP_KERNEL);
    480	if (rc < 0)
    481		goto out;
    482
    483	rc = dev_set_name(&ibdev->dev, name, rc);
    484out:
    485	ida_destroy(&inuse);
    486	return rc;
    487}
    488
    489static void ib_device_release(struct device *device)
    490{
    491	struct ib_device *dev = container_of(device, struct ib_device, dev);
    492
    493	free_netdevs(dev);
    494	WARN_ON(refcount_read(&dev->refcount));
    495	if (dev->hw_stats_data)
    496		ib_device_release_hw_stats(dev->hw_stats_data);
    497	if (dev->port_data) {
    498		ib_cache_release_one(dev);
    499		ib_security_release_port_pkey_list(dev);
    500		rdma_counter_release(dev);
    501		kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
    502				       pdata[0]),
    503			  rcu_head);
    504	}
    505
    506	mutex_destroy(&dev->unregistration_lock);
    507	mutex_destroy(&dev->compat_devs_mutex);
    508
    509	xa_destroy(&dev->compat_devs);
    510	xa_destroy(&dev->client_data);
    511	kfree_rcu(dev, rcu_head);
    512}
    513
    514static int ib_device_uevent(struct device *device,
    515			    struct kobj_uevent_env *env)
    516{
    517	if (add_uevent_var(env, "NAME=%s", dev_name(device)))
    518		return -ENOMEM;
    519
    520	/*
    521	 * It would be nice to pass the node GUID with the event...
    522	 */
    523
    524	return 0;
    525}
    526
    527static const void *net_namespace(struct device *d)
    528{
    529	struct ib_core_device *coredev =
    530			container_of(d, struct ib_core_device, dev);
    531
    532	return read_pnet(&coredev->rdma_net);
    533}
    534
    535static struct class ib_class = {
    536	.name    = "infiniband",
    537	.dev_release = ib_device_release,
    538	.dev_uevent = ib_device_uevent,
    539	.ns_type = &net_ns_type_operations,
    540	.namespace = net_namespace,
    541};
    542
    543static void rdma_init_coredev(struct ib_core_device *coredev,
    544			      struct ib_device *dev, struct net *net)
    545{
     546	/* This BUILD_BUG_ON is intended to catch a layout change in the
     547	 * union of ib_core_device and device.
     548	 * dev must be the first element as ib_core and provider
     549	 * drivers use it. Adding anything in ib_core_device before
     550	 * device will break this assumption.
    551	 */
    552	BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
    553		     offsetof(struct ib_device, dev));
    554
    555	coredev->dev.class = &ib_class;
    556	coredev->dev.groups = dev->groups;
    557	device_initialize(&coredev->dev);
    558	coredev->owner = dev;
    559	INIT_LIST_HEAD(&coredev->port_list);
    560	write_pnet(&coredev->rdma_net, net);
    561}
    562
    563/**
    564 * _ib_alloc_device - allocate an IB device struct
    565 * @size:size of structure to allocate
    566 *
    567 * Low-level drivers should use ib_alloc_device() to allocate &struct
    568 * ib_device.  @size is the size of the structure to be allocated,
    569 * including any private data used by the low-level driver.
    570 * ib_dealloc_device() must be used to free structures allocated with
    571 * ib_alloc_device().
    572 */
    573struct ib_device *_ib_alloc_device(size_t size)
    574{
    575	struct ib_device *device;
    576	unsigned int i;
    577
    578	if (WARN_ON(size < sizeof(struct ib_device)))
    579		return NULL;
    580
    581	device = kzalloc(size, GFP_KERNEL);
    582	if (!device)
    583		return NULL;
    584
    585	if (rdma_restrack_init(device)) {
    586		kfree(device);
    587		return NULL;
    588	}
    589
    590	rdma_init_coredev(&device->coredev, device, &init_net);
    591
    592	INIT_LIST_HEAD(&device->event_handler_list);
    593	spin_lock_init(&device->qp_open_list_lock);
    594	init_rwsem(&device->event_handler_rwsem);
    595	mutex_init(&device->unregistration_lock);
    596	/*
     597	 * client_data needs to be an allocating xarray because we don't want
     598	 * our mark to be destroyed if the user stores NULL in the client data.
    599	 */
    600	xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
    601	init_rwsem(&device->client_data_rwsem);
    602	xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
    603	mutex_init(&device->compat_devs_mutex);
    604	init_completion(&device->unreg_completion);
    605	INIT_WORK(&device->unregistration_work, ib_unregister_work);
    606
    607	spin_lock_init(&device->cq_pools_lock);
    608	for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++)
    609		INIT_LIST_HEAD(&device->cq_pools[i]);
    610
    611	rwlock_init(&device->cache_lock);
    612
    613	device->uverbs_cmd_mask =
    614		BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) |
    615		BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) |
    616		BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) |
    617		BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) |
    618		BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) |
    619		BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
    620		BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) |
    621		BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) |
    622		BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) |
    623		BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) |
    624		BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) |
    625		BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) |
    626		BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) |
    627		BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) |
    628		BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) |
    629		BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) |
    630		BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) |
    631		BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) |
    632		BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) |
    633		BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) |
    634		BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) |
    635		BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) |
    636		BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) |
    637		BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) |
    638		BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) |
    639		BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) |
    640		BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) |
    641		BIT_ULL(IB_USER_VERBS_CMD_REG_MR) |
    642		BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) |
    643		BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ);
    644	return device;
    645}
    646EXPORT_SYMBOL(_ib_alloc_device);
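
/*
 * Illustrative sketch, not part of the original file: a provider normally
 * embeds struct ib_device in its own structure and allocates both through the
 * ib_alloc_device() wrapper around _ib_alloc_device(). The driver structure
 * and helper below are hypothetical.
 */
struct example_drv_device {
	struct ib_device ibdev;	/* must be the first member, see ib_alloc_device() */
	int private_state;
};

static struct example_drv_device *example_alloc(void)
{
	/* Allocates the whole driver struct and initializes the embedded ib_device. */
	return ib_alloc_device(example_drv_device, ibdev);
}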
    647
    648/**
    649 * ib_dealloc_device - free an IB device struct
    650 * @device:structure to free
    651 *
    652 * Free a structure allocated with ib_alloc_device().
    653 */
    654void ib_dealloc_device(struct ib_device *device)
    655{
    656	if (device->ops.dealloc_driver)
    657		device->ops.dealloc_driver(device);
    658
    659	/*
    660	 * ib_unregister_driver() requires all devices to remain in the xarray
    661	 * while their ops are callable. The last op we call is dealloc_driver
    662	 * above.  This is needed to create a fence on op callbacks prior to
    663	 * allowing the driver module to unload.
    664	 */
    665	down_write(&devices_rwsem);
    666	if (xa_load(&devices, device->index) == device)
    667		xa_erase(&devices, device->index);
    668	up_write(&devices_rwsem);
    669
    670	/* Expedite releasing netdev references */
    671	free_netdevs(device);
    672
    673	WARN_ON(!xa_empty(&device->compat_devs));
    674	WARN_ON(!xa_empty(&device->client_data));
    675	WARN_ON(refcount_read(&device->refcount));
    676	rdma_restrack_clean(device);
    677	/* Balances with device_initialize */
    678	put_device(&device->dev);
    679}
    680EXPORT_SYMBOL(ib_dealloc_device);
    681
    682/*
    683 * add_client_context() and remove_client_context() must be safe against
    684 * parallel calls on the same device - registration/unregistration of both the
    685 * device and client can be occurring in parallel.
    686 *
     687 * The routines need to be a fence; any caller must not return until the add
    688 * or remove is fully completed.
    689 */
    690static int add_client_context(struct ib_device *device,
    691			      struct ib_client *client)
    692{
    693	int ret = 0;
    694
    695	if (!device->kverbs_provider && !client->no_kverbs_req)
    696		return 0;
    697
    698	down_write(&device->client_data_rwsem);
    699	/*
    700	 * So long as the client is registered hold both the client and device
    701	 * unregistration locks.
    702	 */
    703	if (!refcount_inc_not_zero(&client->uses))
    704		goto out_unlock;
    705	refcount_inc(&device->refcount);
    706
    707	/*
    708	 * Another caller to add_client_context got here first and has already
    709	 * completely initialized context.
    710	 */
    711	if (xa_get_mark(&device->client_data, client->client_id,
    712		    CLIENT_DATA_REGISTERED))
    713		goto out;
    714
    715	ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
    716			      GFP_KERNEL));
    717	if (ret)
    718		goto out;
    719	downgrade_write(&device->client_data_rwsem);
    720	if (client->add) {
    721		if (client->add(device)) {
    722			/*
    723			 * If a client fails to add then the error code is
    724			 * ignored, but we won't call any more ops on this
    725			 * client.
    726			 */
    727			xa_erase(&device->client_data, client->client_id);
    728			up_read(&device->client_data_rwsem);
    729			ib_device_put(device);
    730			ib_client_put(client);
    731			return 0;
    732		}
    733	}
    734
    735	/* Readers shall not see a client until add has been completed */
    736	xa_set_mark(&device->client_data, client->client_id,
    737		    CLIENT_DATA_REGISTERED);
    738	up_read(&device->client_data_rwsem);
    739	return 0;
    740
    741out:
    742	ib_device_put(device);
    743	ib_client_put(client);
    744out_unlock:
    745	up_write(&device->client_data_rwsem);
    746	return ret;
    747}
    748
    749static void remove_client_context(struct ib_device *device,
    750				  unsigned int client_id)
    751{
    752	struct ib_client *client;
    753	void *client_data;
    754
    755	down_write(&device->client_data_rwsem);
    756	if (!xa_get_mark(&device->client_data, client_id,
    757			 CLIENT_DATA_REGISTERED)) {
    758		up_write(&device->client_data_rwsem);
    759		return;
    760	}
    761	client_data = xa_load(&device->client_data, client_id);
    762	xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
    763	client = xa_load(&clients, client_id);
    764	up_write(&device->client_data_rwsem);
    765
    766	/*
    767	 * Notice we cannot be holding any exclusive locks when calling the
    768	 * remove callback as the remove callback can recurse back into any
    769	 * public functions in this module and thus try for any locks those
    770	 * functions take.
    771	 *
    772	 * For this reason clients and drivers should not call the
     773	 * unregistration functions while holding any locks.
    774	 */
    775	if (client->remove)
    776		client->remove(device, client_data);
    777
    778	xa_erase(&device->client_data, client_id);
    779	ib_device_put(device);
    780	ib_client_put(client);
    781}
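
/*
 * Illustrative sketch, not part of the original file: the client side of the
 * add/remove flow above. A minimal ib_client, registered from module init via
 * ib_register_client(), stores per-device state with ib_set_client_data(). All
 * "example" names are hypothetical.
 */
static struct ib_client example_client;

struct example_client_state {
	struct ib_device *ibdev;
};

static int example_client_add(struct ib_device *device)
{
	struct example_client_state *state = kzalloc(sizeof(*state), GFP_KERNEL);

	if (!state)
		return -ENOMEM; /* the core then skips this client for the device */
	state->ibdev = device;
	ib_set_client_data(device, &example_client, state);
	return 0;
}

static void example_client_remove(struct ib_device *device, void *client_data)
{
	kfree(client_data);
}

static struct ib_client example_client = {
	.name	= "example",
	.add	= example_client_add,
	.remove	= example_client_remove,
};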
    782
    783static int alloc_port_data(struct ib_device *device)
    784{
    785	struct ib_port_data_rcu *pdata_rcu;
    786	u32 port;
    787
    788	if (device->port_data)
    789		return 0;
    790
    791	/* This can only be called once the physical port range is defined */
    792	if (WARN_ON(!device->phys_port_cnt))
    793		return -EINVAL;
    794
    795	/* Reserve U32_MAX so the logic to go over all the ports is sane */
    796	if (WARN_ON(device->phys_port_cnt == U32_MAX))
    797		return -EINVAL;
    798
    799	/*
    800	 * device->port_data is indexed directly by the port number to make
    801	 * access to this data as efficient as possible.
    802	 *
    803	 * Therefore port_data is declared as a 1 based array with potential
    804	 * empty slots at the beginning.
    805	 */
    806	pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
    807					rdma_end_port(device) + 1),
    808			    GFP_KERNEL);
    809	if (!pdata_rcu)
    810		return -ENOMEM;
    811	/*
    812	 * The rcu_head is put in front of the port data array and the stored
    813	 * pointer is adjusted since we never need to see that member until
    814	 * kfree_rcu.
    815	 */
    816	device->port_data = pdata_rcu->pdata;
    817
    818	rdma_for_each_port (device, port) {
    819		struct ib_port_data *pdata = &device->port_data[port];
    820
    821		pdata->ib_dev = device;
    822		spin_lock_init(&pdata->pkey_list_lock);
    823		INIT_LIST_HEAD(&pdata->pkey_list);
    824		spin_lock_init(&pdata->netdev_lock);
    825		INIT_HLIST_NODE(&pdata->ndev_hash_link);
    826	}
    827	return 0;
    828}
    829
    830static int verify_immutable(const struct ib_device *dev, u32 port)
    831{
    832	return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
    833			    rdma_max_mad_size(dev, port) != 0);
    834}
    835
    836static int setup_port_data(struct ib_device *device)
    837{
    838	u32 port;
    839	int ret;
    840
    841	ret = alloc_port_data(device);
    842	if (ret)
    843		return ret;
    844
    845	rdma_for_each_port (device, port) {
    846		struct ib_port_data *pdata = &device->port_data[port];
    847
    848		ret = device->ops.get_port_immutable(device, port,
    849						     &pdata->immutable);
    850		if (ret)
    851			return ret;
    852
    853		if (verify_immutable(device, port))
    854			return -EINVAL;
    855	}
    856	return 0;
    857}
    858
    859/**
    860 * ib_port_immutable_read() - Read rdma port's immutable data
    861 * @dev: IB device
     862 * @port: port number whose immutable data to read. It starts at index 1 and
     863 *        is valid up to and including rdma_end_port().
    864 */
    865const struct ib_port_immutable*
    866ib_port_immutable_read(struct ib_device *dev, unsigned int port)
    867{
    868	WARN_ON(!rdma_is_port_valid(dev, port));
    869	return &dev->port_data[port].immutable;
    870}
    871EXPORT_SYMBOL(ib_port_immutable_read);
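
/*
 * Illustrative sketch, not part of the original file: reading per-port
 * immutable data with the 1-based port indexing described above. The helper
 * name is hypothetical.
 */
static void example_dump_port_caps(struct ib_device *dev)
{
	u32 port;

	rdma_for_each_port (dev, port) {
		const struct ib_port_immutable *imm =
			ib_port_immutable_read(dev, port);

		ibdev_info(dev, "port %u: pkey_tbl_len=%u gid_tbl_len=%u\n",
			   port, imm->pkey_tbl_len, imm->gid_tbl_len);
	}
}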
    872
    873void ib_get_device_fw_str(struct ib_device *dev, char *str)
    874{
    875	if (dev->ops.get_dev_fw_str)
    876		dev->ops.get_dev_fw_str(dev, str);
    877	else
    878		str[0] = '\0';
    879}
    880EXPORT_SYMBOL(ib_get_device_fw_str);
    881
    882static void ib_policy_change_task(struct work_struct *work)
    883{
    884	struct ib_device *dev;
    885	unsigned long index;
    886
    887	down_read(&devices_rwsem);
    888	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
    889		unsigned int i;
    890
    891		rdma_for_each_port (dev, i) {
    892			u64 sp;
    893			ib_get_cached_subnet_prefix(dev, i, &sp);
    894			ib_security_cache_change(dev, i, sp);
    895		}
    896	}
    897	up_read(&devices_rwsem);
    898}
    899
    900static int ib_security_change(struct notifier_block *nb, unsigned long event,
    901			      void *lsm_data)
    902{
    903	if (event != LSM_POLICY_CHANGE)
    904		return NOTIFY_DONE;
    905
    906	schedule_work(&ib_policy_change_work);
    907	ib_mad_agent_security_change();
    908
    909	return NOTIFY_OK;
    910}
    911
    912static void compatdev_release(struct device *dev)
    913{
    914	struct ib_core_device *cdev =
    915		container_of(dev, struct ib_core_device, dev);
    916
    917	kfree(cdev);
    918}
    919
    920static int add_one_compat_dev(struct ib_device *device,
    921			      struct rdma_dev_net *rnet)
    922{
    923	struct ib_core_device *cdev;
    924	int ret;
    925
    926	lockdep_assert_held(&rdma_nets_rwsem);
    927	if (!ib_devices_shared_netns)
    928		return 0;
    929
    930	/*
    931	 * Create and add compat device in all namespaces other than where it
    932	 * is currently bound to.
    933	 */
    934	if (net_eq(read_pnet(&rnet->net),
    935		   read_pnet(&device->coredev.rdma_net)))
    936		return 0;
    937
    938	/*
    939	 * The first of init_net() or ib_register_device() to take the
    940	 * compat_devs_mutex wins and gets to add the device. Others will wait
    941	 * for completion here.
    942	 */
    943	mutex_lock(&device->compat_devs_mutex);
    944	cdev = xa_load(&device->compat_devs, rnet->id);
    945	if (cdev) {
    946		ret = 0;
    947		goto done;
    948	}
    949	ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
    950	if (ret)
    951		goto done;
    952
    953	cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
    954	if (!cdev) {
    955		ret = -ENOMEM;
    956		goto cdev_err;
    957	}
    958
    959	cdev->dev.parent = device->dev.parent;
    960	rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
    961	cdev->dev.release = compatdev_release;
    962	ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
    963	if (ret)
    964		goto add_err;
    965
    966	ret = device_add(&cdev->dev);
    967	if (ret)
    968		goto add_err;
    969	ret = ib_setup_port_attrs(cdev);
    970	if (ret)
    971		goto port_err;
    972
    973	ret = xa_err(xa_store(&device->compat_devs, rnet->id,
    974			      cdev, GFP_KERNEL));
    975	if (ret)
    976		goto insert_err;
    977
    978	mutex_unlock(&device->compat_devs_mutex);
    979	return 0;
    980
    981insert_err:
    982	ib_free_port_attrs(cdev);
    983port_err:
    984	device_del(&cdev->dev);
    985add_err:
    986	put_device(&cdev->dev);
    987cdev_err:
    988	xa_release(&device->compat_devs, rnet->id);
    989done:
    990	mutex_unlock(&device->compat_devs_mutex);
    991	return ret;
    992}
    993
    994static void remove_one_compat_dev(struct ib_device *device, u32 id)
    995{
    996	struct ib_core_device *cdev;
    997
    998	mutex_lock(&device->compat_devs_mutex);
    999	cdev = xa_erase(&device->compat_devs, id);
   1000	mutex_unlock(&device->compat_devs_mutex);
   1001	if (cdev) {
   1002		ib_free_port_attrs(cdev);
   1003		device_del(&cdev->dev);
   1004		put_device(&cdev->dev);
   1005	}
   1006}
   1007
   1008static void remove_compat_devs(struct ib_device *device)
   1009{
   1010	struct ib_core_device *cdev;
   1011	unsigned long index;
   1012
   1013	xa_for_each (&device->compat_devs, index, cdev)
   1014		remove_one_compat_dev(device, index);
   1015}
   1016
   1017static int add_compat_devs(struct ib_device *device)
   1018{
   1019	struct rdma_dev_net *rnet;
   1020	unsigned long index;
   1021	int ret = 0;
   1022
   1023	lockdep_assert_held(&devices_rwsem);
   1024
   1025	down_read(&rdma_nets_rwsem);
   1026	xa_for_each (&rdma_nets, index, rnet) {
   1027		ret = add_one_compat_dev(device, rnet);
   1028		if (ret)
   1029			break;
   1030	}
   1031	up_read(&rdma_nets_rwsem);
   1032	return ret;
   1033}
   1034
   1035static void remove_all_compat_devs(void)
   1036{
   1037	struct ib_compat_device *cdev;
   1038	struct ib_device *dev;
   1039	unsigned long index;
   1040
   1041	down_read(&devices_rwsem);
   1042	xa_for_each (&devices, index, dev) {
   1043		unsigned long c_index = 0;
   1044
   1045		/* Hold nets_rwsem so that any other thread modifying this
   1046		 * system param can sync with this thread.
   1047		 */
   1048		down_read(&rdma_nets_rwsem);
   1049		xa_for_each (&dev->compat_devs, c_index, cdev)
   1050			remove_one_compat_dev(dev, c_index);
   1051		up_read(&rdma_nets_rwsem);
   1052	}
   1053	up_read(&devices_rwsem);
   1054}
   1055
   1056static int add_all_compat_devs(void)
   1057{
   1058	struct rdma_dev_net *rnet;
   1059	struct ib_device *dev;
   1060	unsigned long index;
   1061	int ret = 0;
   1062
   1063	down_read(&devices_rwsem);
   1064	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
   1065		unsigned long net_index = 0;
   1066
   1067		/* Hold nets_rwsem so that any other thread modifying this
   1068		 * system param can sync with this thread.
   1069		 */
   1070		down_read(&rdma_nets_rwsem);
   1071		xa_for_each (&rdma_nets, net_index, rnet) {
   1072			ret = add_one_compat_dev(dev, rnet);
   1073			if (ret)
   1074				break;
   1075		}
   1076		up_read(&rdma_nets_rwsem);
   1077	}
   1078	up_read(&devices_rwsem);
   1079	if (ret)
   1080		remove_all_compat_devs();
   1081	return ret;
   1082}
   1083
   1084int rdma_compatdev_set(u8 enable)
   1085{
   1086	struct rdma_dev_net *rnet;
   1087	unsigned long index;
   1088	int ret = 0;
   1089
   1090	down_write(&rdma_nets_rwsem);
   1091	if (ib_devices_shared_netns == enable) {
   1092		up_write(&rdma_nets_rwsem);
   1093		return 0;
   1094	}
   1095
   1096	/* enable/disable of compat devices is not supported
    1097	 * when more than the default init_net exists.
   1098	 */
   1099	xa_for_each (&rdma_nets, index, rnet) {
   1100		ret++;
   1101		break;
   1102	}
   1103	if (!ret)
   1104		ib_devices_shared_netns = enable;
   1105	up_write(&rdma_nets_rwsem);
   1106	if (ret)
   1107		return -EBUSY;
   1108
   1109	if (enable)
   1110		ret = add_all_compat_devs();
   1111	else
   1112		remove_all_compat_devs();
   1113	return ret;
   1114}
   1115
   1116static void rdma_dev_exit_net(struct net *net)
   1117{
   1118	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
   1119	struct ib_device *dev;
   1120	unsigned long index;
   1121	int ret;
   1122
   1123	down_write(&rdma_nets_rwsem);
   1124	/*
   1125	 * Prevent the ID from being re-used and hide the id from xa_for_each.
   1126	 */
   1127	ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
   1128	WARN_ON(ret);
   1129	up_write(&rdma_nets_rwsem);
   1130
   1131	down_read(&devices_rwsem);
   1132	xa_for_each (&devices, index, dev) {
   1133		get_device(&dev->dev);
   1134		/*
    1135		 * Release the devices_rwsem so that the potentially blocking
    1136		 * device_del() doesn't hold the devices_rwsem for too long.
   1137		 */
   1138		up_read(&devices_rwsem);
   1139
   1140		remove_one_compat_dev(dev, rnet->id);
   1141
   1142		/*
   1143		 * If the real device is in the NS then move it back to init.
   1144		 */
   1145		rdma_dev_change_netns(dev, net, &init_net);
   1146
   1147		put_device(&dev->dev);
   1148		down_read(&devices_rwsem);
   1149	}
   1150	up_read(&devices_rwsem);
   1151
   1152	rdma_nl_net_exit(rnet);
   1153	xa_erase(&rdma_nets, rnet->id);
   1154}
   1155
   1156static __net_init int rdma_dev_init_net(struct net *net)
   1157{
   1158	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
   1159	unsigned long index;
   1160	struct ib_device *dev;
   1161	int ret;
   1162
   1163	write_pnet(&rnet->net, net);
   1164
   1165	ret = rdma_nl_net_init(rnet);
   1166	if (ret)
   1167		return ret;
   1168
   1169	/* No need to create any compat devices in default init_net. */
   1170	if (net_eq(net, &init_net))
   1171		return 0;
   1172
   1173	ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
   1174	if (ret) {
   1175		rdma_nl_net_exit(rnet);
   1176		return ret;
   1177	}
   1178
   1179	down_read(&devices_rwsem);
   1180	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
   1181		/* Hold nets_rwsem so that netlink command cannot change
   1182		 * system configuration for device sharing mode.
   1183		 */
   1184		down_read(&rdma_nets_rwsem);
   1185		ret = add_one_compat_dev(dev, rnet);
   1186		up_read(&rdma_nets_rwsem);
   1187		if (ret)
   1188			break;
   1189	}
   1190	up_read(&devices_rwsem);
   1191
   1192	if (ret)
   1193		rdma_dev_exit_net(net);
   1194
   1195	return ret;
   1196}
   1197
   1198/*
   1199 * Assign the unique string device name and the unique device index. This is
   1200 * undone by ib_dealloc_device.
   1201 */
   1202static int assign_name(struct ib_device *device, const char *name)
   1203{
   1204	static u32 last_id;
   1205	int ret;
   1206
   1207	down_write(&devices_rwsem);
   1208	/* Assign a unique name to the device */
   1209	if (strchr(name, '%'))
   1210		ret = alloc_name(device, name);
   1211	else
   1212		ret = dev_set_name(&device->dev, name);
   1213	if (ret)
   1214		goto out;
   1215
   1216	if (__ib_device_get_by_name(dev_name(&device->dev))) {
   1217		ret = -ENFILE;
   1218		goto out;
   1219	}
   1220	strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
   1221
   1222	ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
   1223			&last_id, GFP_KERNEL);
   1224	if (ret > 0)
   1225		ret = 0;
   1226
   1227out:
   1228	up_write(&devices_rwsem);
   1229	return ret;
   1230}
   1231
   1232/*
   1233 * setup_device() allocates memory and sets up data that requires calling the
   1234 * device ops, this is the only reason these actions are not done during
   1235 * ib_alloc_device. It is undone by ib_dealloc_device().
   1236 */
   1237static int setup_device(struct ib_device *device)
   1238{
   1239	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
   1240	int ret;
   1241
   1242	ib_device_check_mandatory(device);
   1243
   1244	ret = setup_port_data(device);
   1245	if (ret) {
   1246		dev_warn(&device->dev, "Couldn't create per-port data\n");
   1247		return ret;
   1248	}
   1249
   1250	memset(&device->attrs, 0, sizeof(device->attrs));
   1251	ret = device->ops.query_device(device, &device->attrs, &uhw);
   1252	if (ret) {
   1253		dev_warn(&device->dev,
   1254			 "Couldn't query the device attributes\n");
   1255		return ret;
   1256	}
   1257
   1258	return 0;
   1259}
   1260
   1261static void disable_device(struct ib_device *device)
   1262{
   1263	u32 cid;
   1264
   1265	WARN_ON(!refcount_read(&device->refcount));
   1266
   1267	down_write(&devices_rwsem);
   1268	xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
   1269	up_write(&devices_rwsem);
   1270
   1271	/*
   1272	 * Remove clients in LIFO order, see assign_client_id. This could be
   1273	 * more efficient if xarray learns to reverse iterate. Since no new
   1274	 * clients can be added to this ib_device past this point we only need
   1275	 * the maximum possible client_id value here.
   1276	 */
   1277	down_read(&clients_rwsem);
   1278	cid = highest_client_id;
   1279	up_read(&clients_rwsem);
   1280	while (cid) {
   1281		cid--;
   1282		remove_client_context(device, cid);
   1283	}
   1284
   1285	ib_cq_pool_cleanup(device);
   1286
   1287	/* Pairs with refcount_set in enable_device */
   1288	ib_device_put(device);
   1289	wait_for_completion(&device->unreg_completion);
   1290
   1291	/*
   1292	 * compat devices must be removed after device refcount drops to zero.
   1293	 * Otherwise init_net() may add more compatdevs after removing compat
   1294	 * devices and before device is disabled.
   1295	 */
   1296	remove_compat_devs(device);
   1297}
   1298
   1299/*
   1300 * An enabled device is visible to all clients and to all the public facing
   1301 * APIs that return a device pointer. This always returns with a new get, even
   1302 * if it fails.
   1303 */
   1304static int enable_device_and_get(struct ib_device *device)
   1305{
   1306	struct ib_client *client;
   1307	unsigned long index;
   1308	int ret = 0;
   1309
   1310	/*
   1311	 * One ref belongs to the xa and the other belongs to this
   1312	 * thread. This is needed to guard against parallel unregistration.
   1313	 */
   1314	refcount_set(&device->refcount, 2);
   1315	down_write(&devices_rwsem);
   1316	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
   1317
   1318	/*
   1319	 * By using downgrade_write() we ensure that no other thread can clear
   1320	 * DEVICE_REGISTERED while we are completing the client setup.
   1321	 */
   1322	downgrade_write(&devices_rwsem);
   1323
   1324	if (device->ops.enable_driver) {
   1325		ret = device->ops.enable_driver(device);
   1326		if (ret)
   1327			goto out;
   1328	}
   1329
   1330	down_read(&clients_rwsem);
   1331	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
   1332		ret = add_client_context(device, client);
   1333		if (ret)
   1334			break;
   1335	}
   1336	up_read(&clients_rwsem);
   1337	if (!ret)
   1338		ret = add_compat_devs(device);
   1339out:
   1340	up_read(&devices_rwsem);
   1341	return ret;
   1342}
   1343
   1344static void prevent_dealloc_device(struct ib_device *ib_dev)
   1345{
   1346}
   1347
   1348/**
   1349 * ib_register_device - Register an IB device with IB core
   1350 * @device: Device to register
   1351 * @name: unique string device name. This may include a '%' which will
   1352 * 	  cause a unique index to be added to the passed device name.
   1353 * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB
   1354 *	        device will be used. In this case the caller should fully
   1355 *		setup the ibdev for DMA. This usually means using dma_virt_ops.
   1356 *
   1357 * Low-level drivers use ib_register_device() to register their
   1358 * devices with the IB core.  All registered clients will receive a
   1359 * callback for each device that is added. @device must be allocated
   1360 * with ib_alloc_device().
   1361 *
   1362 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
   1363 * asynchronously then the device pointer may become freed as soon as this
   1364 * function returns.
   1365 */
   1366int ib_register_device(struct ib_device *device, const char *name,
   1367		       struct device *dma_device)
   1368{
   1369	int ret;
   1370
   1371	ret = assign_name(device, name);
   1372	if (ret)
   1373		return ret;
   1374
   1375	/*
   1376	 * If the caller does not provide a DMA capable device then the IB core
   1377	 * will set up ib_sge and scatterlist structures that stash the kernel
   1378	 * virtual address into the address field.
   1379	 */
   1380	WARN_ON(dma_device && !dma_device->dma_parms);
   1381	device->dma_device = dma_device;
   1382
   1383	ret = setup_device(device);
   1384	if (ret)
   1385		return ret;
   1386
   1387	ret = ib_cache_setup_one(device);
   1388	if (ret) {
   1389		dev_warn(&device->dev,
   1390			 "Couldn't set up InfiniBand P_Key/GID cache\n");
   1391		return ret;
   1392	}
   1393
   1394	device->groups[0] = &ib_dev_attr_group;
   1395	device->groups[1] = device->ops.device_group;
   1396	ret = ib_setup_device_attrs(device);
   1397	if (ret)
   1398		goto cache_cleanup;
   1399
   1400	ib_device_register_rdmacg(device);
   1401
   1402	rdma_counter_init(device);
   1403
   1404	/*
   1405	 * Ensure that ADD uevent is not fired because it
    1406	 * is too early and the device is not initialized yet.
   1407	 */
   1408	dev_set_uevent_suppress(&device->dev, true);
   1409	ret = device_add(&device->dev);
   1410	if (ret)
   1411		goto cg_cleanup;
   1412
   1413	ret = ib_setup_port_attrs(&device->coredev);
   1414	if (ret) {
   1415		dev_warn(&device->dev,
   1416			 "Couldn't register device with driver model\n");
   1417		goto dev_cleanup;
   1418	}
   1419
   1420	ret = enable_device_and_get(device);
   1421	if (ret) {
   1422		void (*dealloc_fn)(struct ib_device *);
   1423
   1424		/*
   1425		 * If we hit this error flow then we don't want to
   1426		 * automatically dealloc the device since the caller is
   1427		 * expected to call ib_dealloc_device() after
   1428		 * ib_register_device() fails. This is tricky due to the
   1429		 * possibility for a parallel unregistration along with this
   1430		 * error flow. Since we have a refcount here we know any
   1431		 * parallel flow is stopped in disable_device and will see the
   1432		 * special dealloc_driver pointer, causing the responsibility to
   1433		 * ib_dealloc_device() to revert back to this thread.
   1434		 */
   1435		dealloc_fn = device->ops.dealloc_driver;
   1436		device->ops.dealloc_driver = prevent_dealloc_device;
   1437		ib_device_put(device);
   1438		__ib_unregister_device(device);
   1439		device->ops.dealloc_driver = dealloc_fn;
   1440		dev_set_uevent_suppress(&device->dev, false);
   1441		return ret;
   1442	}
   1443	dev_set_uevent_suppress(&device->dev, false);
   1444	/* Mark for userspace that device is ready */
   1445	kobject_uevent(&device->dev.kobj, KOBJ_ADD);
   1446	ib_device_put(device);
   1447
   1448	return 0;
   1449
   1450dev_cleanup:
   1451	device_del(&device->dev);
   1452cg_cleanup:
   1453	dev_set_uevent_suppress(&device->dev, false);
   1454	ib_device_unregister_rdmacg(device);
   1455cache_cleanup:
   1456	ib_cache_cleanup_one(device);
   1457	return ret;
   1458}
   1459EXPORT_SYMBOL(ib_register_device);
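
/*
 * Illustrative sketch, not part of the original file: the registration
 * sequence a provider's probe path typically follows, reusing the hypothetical
 * example_drv_device structure sketched after _ib_alloc_device(). The parent
 * device is assumed to be DMA capable.
 */
static int example_probe(struct device *parent)
{
	struct example_drv_device *drv;
	int ret;

	drv = ib_alloc_device(example_drv_device, ibdev);
	if (!drv)
		return -ENOMEM;

	drv->ibdev.phys_port_cnt = 1;
	/* A real provider also fills drv->ibdev.ops via ib_set_device_ops(). */

	/* The "%d" lets assign_name()/alloc_name() pick a unique index. */
	ret = ib_register_device(&drv->ibdev, "example%d", parent);
	if (ret)
		ib_dealloc_device(&drv->ibdev);
	return ret;
}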
   1460
   1461/* Callers must hold a get on the device. */
   1462static void __ib_unregister_device(struct ib_device *ib_dev)
   1463{
   1464	/*
   1465	 * We have a registration lock so that all the calls to unregister are
    1466	 * fully fenced; once any unregister returns, the device is truly
   1467	 * unregistered even if multiple callers are unregistering it at the
   1468	 * same time. This also interacts with the registration flow and
   1469	 * provides sane semantics if register and unregister are racing.
   1470	 */
   1471	mutex_lock(&ib_dev->unregistration_lock);
   1472	if (!refcount_read(&ib_dev->refcount))
   1473		goto out;
   1474
   1475	disable_device(ib_dev);
   1476
   1477	/* Expedite removing unregistered pointers from the hash table */
   1478	free_netdevs(ib_dev);
   1479
   1480	ib_free_port_attrs(&ib_dev->coredev);
   1481	device_del(&ib_dev->dev);
   1482	ib_device_unregister_rdmacg(ib_dev);
   1483	ib_cache_cleanup_one(ib_dev);
   1484
   1485	/*
   1486	 * Drivers using the new flow may not call ib_dealloc_device except
   1487	 * in error unwind prior to registration success.
   1488	 */
   1489	if (ib_dev->ops.dealloc_driver &&
   1490	    ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
   1491		WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
   1492		ib_dealloc_device(ib_dev);
   1493	}
   1494out:
   1495	mutex_unlock(&ib_dev->unregistration_lock);
   1496}
   1497
   1498/**
   1499 * ib_unregister_device - Unregister an IB device
   1500 * @ib_dev: The device to unregister
   1501 *
   1502 * Unregister an IB device.  All clients will receive a remove callback.
   1503 *
   1504 * Callers should call this routine only once, and protect against races with
   1505 * registration. Typically it should only be called as part of a remove
   1506 * callback in an implementation of driver core's struct device_driver and
   1507 * related.
   1508 *
   1509 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
   1510 * this function.
   1511 */
   1512void ib_unregister_device(struct ib_device *ib_dev)
   1513{
   1514	get_device(&ib_dev->dev);
   1515	__ib_unregister_device(ib_dev);
   1516	put_device(&ib_dev->dev);
   1517}
   1518EXPORT_SYMBOL(ib_unregister_device);
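
/*
 * Illustrative sketch, not part of the original file: the matching remove path
 * for the probe sketch above, for a driver that does not set
 * ops.dealloc_driver and therefore frees the device itself.
 */
static void example_remove(struct example_drv_device *drv)
{
	ib_unregister_device(&drv->ibdev);
	ib_dealloc_device(&drv->ibdev);
}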
   1519
   1520/**
   1521 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
   1522 * @ib_dev: The device to unregister
   1523 *
   1524 * This is the same as ib_unregister_device(), except it includes an internal
   1525 * ib_device_put() that should match a 'get' obtained by the caller.
   1526 *
   1527 * It is safe to call this routine concurrently from multiple threads while
   1528 * holding the 'get'. When the function returns the device is fully
   1529 * unregistered.
   1530 *
   1531 * Drivers using this flow MUST use the driver_unregister callback to clean up
   1532 * their resources associated with the device and dealloc it.
   1533 */
   1534void ib_unregister_device_and_put(struct ib_device *ib_dev)
   1535{
   1536	WARN_ON(!ib_dev->ops.dealloc_driver);
   1537	get_device(&ib_dev->dev);
   1538	ib_device_put(ib_dev);
   1539	__ib_unregister_device(ib_dev);
   1540	put_device(&ib_dev->dev);
   1541}
   1542EXPORT_SYMBOL(ib_unregister_device_and_put);
   1543
   1544/**
   1545 * ib_unregister_driver - Unregister all IB devices for a driver
   1546 * @driver_id: The driver to unregister
   1547 *
   1548 * This implements a fence for device unregistration. It only returns once all
   1549 * devices associated with the driver_id have fully completed their
   1550 * unregistration and returned from ib_unregister_device*().
   1551 *
    1552 * If devices are not yet unregistered, it goes ahead and starts unregistering
   1553 * them.
   1554 *
   1555 * This does not block creation of new devices with the given driver_id, that
   1556 * is the responsibility of the caller.
   1557 */
   1558void ib_unregister_driver(enum rdma_driver_id driver_id)
   1559{
   1560	struct ib_device *ib_dev;
   1561	unsigned long index;
   1562
   1563	down_read(&devices_rwsem);
   1564	xa_for_each (&devices, index, ib_dev) {
   1565		if (ib_dev->ops.driver_id != driver_id)
   1566			continue;
   1567
   1568		get_device(&ib_dev->dev);
   1569		up_read(&devices_rwsem);
   1570
   1571		WARN_ON(!ib_dev->ops.dealloc_driver);
   1572		__ib_unregister_device(ib_dev);
   1573
   1574		put_device(&ib_dev->dev);
   1575		down_read(&devices_rwsem);
   1576	}
   1577	up_read(&devices_rwsem);
   1578}
   1579EXPORT_SYMBOL(ib_unregister_driver);
   1580
   1581static void ib_unregister_work(struct work_struct *work)
   1582{
   1583	struct ib_device *ib_dev =
   1584		container_of(work, struct ib_device, unregistration_work);
   1585
   1586	__ib_unregister_device(ib_dev);
   1587	put_device(&ib_dev->dev);
   1588}
   1589
   1590/**
   1591 * ib_unregister_device_queued - Unregister a device using a work queue
   1592 * @ib_dev: The device to unregister
   1593 *
   1594 * This schedules an asynchronous unregistration using a WQ for the device. A
   1595 * driver should use this to avoid holding locks while doing unregistration,
   1596 * such as holding the RTNL lock.
   1597 *
   1598 * Drivers using this API must use ib_unregister_driver before module unload
   1599 * to ensure that all scheduled unregistrations have completed.
   1600 */
   1601void ib_unregister_device_queued(struct ib_device *ib_dev)
   1602{
   1603	WARN_ON(!refcount_read(&ib_dev->refcount));
   1604	WARN_ON(!ib_dev->ops.dealloc_driver);
   1605	get_device(&ib_dev->dev);
   1606	if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work))
   1607		put_device(&ib_dev->dev);
   1608}
   1609EXPORT_SYMBOL(ib_unregister_device_queued);
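
/*
 * Illustrative sketch, not part of the original file: a module exit path for a
 * driver that unregisters asynchronously, wired up via module_exit() in the
 * real module. RDMA_DRIVER_UNKNOWN is only a placeholder; a real driver passes
 * its own enum rdma_driver_id value.
 */
static void example_driver_exit(void)
{
	/* Fences all work queued by ib_unregister_device_queued(). */
	ib_unregister_driver(RDMA_DRIVER_UNKNOWN);
}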
   1610
   1611/*
   1612 * The caller must pass in a device that has the kref held and the refcount
   1613 * released. If the device is in cur_net and still registered then it is moved
   1614 * into net.
   1615 */
   1616static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
   1617				 struct net *net)
   1618{
   1619	int ret2 = -EINVAL;
   1620	int ret;
   1621
   1622	mutex_lock(&device->unregistration_lock);
   1623
   1624	/*
    1625	 * If a device is not held via ib_device_get() or if the
    1626	 * unregistration_lock is not held, the namespace can be changed or the
    1627	 * device can be unregistered. Check again under the lock.
   1628	 */
   1629	if (refcount_read(&device->refcount) == 0 ||
   1630	    !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
   1631		ret = -ENODEV;
   1632		goto out;
   1633	}
   1634
   1635	kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
   1636	disable_device(device);
   1637
   1638	/*
   1639	 * At this point no one can be using the device, so it is safe to
   1640	 * change the namespace.
   1641	 */
   1642	write_pnet(&device->coredev.rdma_net, net);
   1643
   1644	down_read(&devices_rwsem);
   1645	/*
   1646	 * Currently rdma devices are system wide unique. So the device name
   1647	 * is guaranteed free in the new namespace. Publish the new namespace
   1648	 * at the sysfs level.
   1649	 */
   1650	ret = device_rename(&device->dev, dev_name(&device->dev));
   1651	up_read(&devices_rwsem);
   1652	if (ret) {
   1653		dev_warn(&device->dev,
   1654			 "%s: Couldn't rename device after namespace change\n",
   1655			 __func__);
   1656		/* Try and put things back and re-enable the device */
   1657		write_pnet(&device->coredev.rdma_net, cur_net);
   1658	}
   1659
   1660	ret2 = enable_device_and_get(device);
   1661	if (ret2) {
   1662		/*
   1663		 * This shouldn't really happen, but if it does, let the user
    1664		 * retry at a later point. So don't disable the device.
   1665		 */
   1666		dev_warn(&device->dev,
   1667			 "%s: Couldn't re-enable device after namespace change\n",
   1668			 __func__);
   1669	}
   1670	kobject_uevent(&device->dev.kobj, KOBJ_ADD);
   1671
   1672	ib_device_put(device);
   1673out:
   1674	mutex_unlock(&device->unregistration_lock);
   1675	if (ret)
   1676		return ret;
   1677	return ret2;
   1678}
   1679
   1680int ib_device_set_netns_put(struct sk_buff *skb,
   1681			    struct ib_device *dev, u32 ns_fd)
   1682{
   1683	struct net *net;
   1684	int ret;
   1685
   1686	net = get_net_ns_by_fd(ns_fd);
   1687	if (IS_ERR(net)) {
   1688		ret = PTR_ERR(net);
   1689		goto net_err;
   1690	}
   1691
   1692	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
   1693		ret = -EPERM;
   1694		goto ns_err;
   1695	}
   1696
   1697	/*
   1698	 * All the ib_clients, including uverbs, are reset when the namespace is
   1699	 * changed and this cannot be blocked waiting for userspace to do
   1700	 * something, so disassociation is mandatory.
   1701	 */
   1702	if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) {
   1703		ret = -EOPNOTSUPP;
   1704		goto ns_err;
   1705	}
   1706
   1707	get_device(&dev->dev);
   1708	ib_device_put(dev);
   1709	ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
   1710	put_device(&dev->dev);
   1711
   1712	put_net(net);
   1713	return ret;
   1714
   1715ns_err:
   1716	put_net(net);
   1717net_err:
   1718	ib_device_put(dev);
   1719	return ret;
   1720}
   1721
   1722static struct pernet_operations rdma_dev_net_ops = {
   1723	.init = rdma_dev_init_net,
   1724	.exit = rdma_dev_exit_net,
   1725	.id = &rdma_dev_net_id,
   1726	.size = sizeof(struct rdma_dev_net),
   1727};
   1728
   1729static int assign_client_id(struct ib_client *client)
   1730{
   1731	int ret;
   1732
   1733	down_write(&clients_rwsem);
   1734	/*
   1735	 * The add/remove callbacks must be called in FIFO/LIFO order. To
   1736	 * achieve this we assign client_ids so they are sorted in
   1737	 * registration order.
   1738	 */
   1739	client->client_id = highest_client_id;
   1740	ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
   1741	if (ret)
   1742		goto out;
   1743
   1744	highest_client_id++;
   1745	xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
   1746
   1747out:
   1748	up_write(&clients_rwsem);
   1749	return ret;
   1750}
   1751
   1752static void remove_client_id(struct ib_client *client)
   1753{
   1754	down_write(&clients_rwsem);
   1755	xa_erase(&clients, client->client_id);
   1756	for (; highest_client_id; highest_client_id--)
   1757		if (xa_load(&clients, highest_client_id - 1))
   1758			break;
   1759	up_write(&clients_rwsem);
   1760}
   1761
   1762/**
   1763 * ib_register_client - Register an IB client
   1764 * @client:Client to register
   1765 *
   1766 * Upper level users of the IB drivers can use ib_register_client() to
   1767 * register callbacks for IB device addition and removal.  When an IB
   1768 * device is added, each registered client's add method will be called
   1769 * (in the order the clients were registered), and when a device is
   1770 * removed, each client's remove method will be called (in the reverse
   1771 * order that clients were registered).  In addition, when
   1772 * ib_register_client() is called, the client will receive an add
   1773 * callback for all devices already registered.
   1774 */
   1775int ib_register_client(struct ib_client *client)
   1776{
   1777	struct ib_device *device;
   1778	unsigned long index;
   1779	int ret;
   1780
   1781	refcount_set(&client->uses, 1);
   1782	init_completion(&client->uses_zero);
   1783	ret = assign_client_id(client);
   1784	if (ret)
   1785		return ret;
   1786
   1787	down_read(&devices_rwsem);
   1788	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
   1789		ret = add_client_context(device, client);
   1790		if (ret) {
   1791			up_read(&devices_rwsem);
   1792			ib_unregister_client(client);
   1793			return ret;
   1794		}
   1795	}
   1796	up_read(&devices_rwsem);
   1797	return 0;
   1798}
   1799EXPORT_SYMBOL(ib_register_client);
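/*
 * Illustrative sketch (not part of the original file): a minimal ib_client.
 * The add() callback runs for every existing and future device; remove()
 * runs in reverse registration order on unregister. Names prefixed demo_
 * are hypothetical; assumes <rdma/ib_verbs.h> and <linux/module.h>.
 */
#if 0	/* usage sketch, not compiled */
static int demo_add_one(struct ib_device *device)
{
	pr_info("demo: device %s added\n", dev_name(&device->dev));
	return 0;
}

static void demo_remove_one(struct ib_device *device, void *client_data)
{
	pr_info("demo: device %s removed\n", dev_name(&device->dev));
}

static struct ib_client demo_client = {
	.name   = "demo",
	.add    = demo_add_one,
	.remove = demo_remove_one,
};

static int __init demo_init(void)
{
	/* Also delivers add() for devices registered before this call. */
	return ib_register_client(&demo_client);
}

static void __exit demo_cleanup(void)
{
	/* Full fence: no callbacks run or are still running after this. */
	ib_unregister_client(&demo_client);
}
#endif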
   1800
   1801/**
   1802 * ib_unregister_client - Unregister an IB client
   1803 * @client:Client to unregister
   1804 *
   1805 * Upper level users use ib_unregister_client() to remove their client
   1806 * registration.  When ib_unregister_client() is called, the client
   1807 * will receive a remove callback for each IB device still registered.
   1808 *
    1809 * This is a full fence: once it returns, no client callbacks will be called
    1810 * or will still be running in another thread.
   1811 */
   1812void ib_unregister_client(struct ib_client *client)
   1813{
   1814	struct ib_device *device;
   1815	unsigned long index;
   1816
   1817	down_write(&clients_rwsem);
   1818	ib_client_put(client);
   1819	xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
   1820	up_write(&clients_rwsem);
   1821
   1822	/* We do not want to have locks while calling client->remove() */
   1823	rcu_read_lock();
   1824	xa_for_each (&devices, index, device) {
   1825		if (!ib_device_try_get(device))
   1826			continue;
   1827		rcu_read_unlock();
   1828
   1829		remove_client_context(device, client->client_id);
   1830
   1831		ib_device_put(device);
   1832		rcu_read_lock();
   1833	}
   1834	rcu_read_unlock();
   1835
   1836	/*
   1837	 * remove_client_context() is not a fence, it can return even though a
   1838	 * removal is ongoing. Wait until all removals are completed.
   1839	 */
   1840	wait_for_completion(&client->uses_zero);
   1841	remove_client_id(client);
   1842}
   1843EXPORT_SYMBOL(ib_unregister_client);
   1844
   1845static int __ib_get_global_client_nl_info(const char *client_name,
   1846					  struct ib_client_nl_info *res)
   1847{
   1848	struct ib_client *client;
   1849	unsigned long index;
   1850	int ret = -ENOENT;
   1851
   1852	down_read(&clients_rwsem);
   1853	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
   1854		if (strcmp(client->name, client_name) != 0)
   1855			continue;
   1856		if (!client->get_global_nl_info) {
   1857			ret = -EOPNOTSUPP;
   1858			break;
   1859		}
   1860		ret = client->get_global_nl_info(res);
   1861		if (WARN_ON(ret == -ENOENT))
   1862			ret = -EINVAL;
   1863		if (!ret && res->cdev)
   1864			get_device(res->cdev);
   1865		break;
   1866	}
   1867	up_read(&clients_rwsem);
   1868	return ret;
   1869}
   1870
   1871static int __ib_get_client_nl_info(struct ib_device *ibdev,
   1872				   const char *client_name,
   1873				   struct ib_client_nl_info *res)
   1874{
   1875	unsigned long index;
   1876	void *client_data;
   1877	int ret = -ENOENT;
   1878
   1879	down_read(&ibdev->client_data_rwsem);
   1880	xan_for_each_marked (&ibdev->client_data, index, client_data,
   1881			     CLIENT_DATA_REGISTERED) {
   1882		struct ib_client *client = xa_load(&clients, index);
   1883
   1884		if (!client || strcmp(client->name, client_name) != 0)
   1885			continue;
   1886		if (!client->get_nl_info) {
   1887			ret = -EOPNOTSUPP;
   1888			break;
   1889		}
   1890		ret = client->get_nl_info(ibdev, client_data, res);
   1891		if (WARN_ON(ret == -ENOENT))
   1892			ret = -EINVAL;
   1893
   1894		/*
   1895		 * The cdev is guaranteed valid as long as we are inside the
   1896		 * client_data_rwsem as remove_one can't be called. Keep it
   1897		 * valid for the caller.
   1898		 */
   1899		if (!ret && res->cdev)
   1900			get_device(res->cdev);
   1901		break;
   1902	}
   1903	up_read(&ibdev->client_data_rwsem);
   1904
   1905	return ret;
   1906}
   1907
   1908/**
   1909 * ib_get_client_nl_info - Fetch the nl_info from a client
   1910 * @ibdev: IB device
   1911 * @client_name: Name of the client
   1912 * @res: Result of the query
   1913 */
   1914int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
   1915			  struct ib_client_nl_info *res)
   1916{
   1917	int ret;
   1918
   1919	if (ibdev)
   1920		ret = __ib_get_client_nl_info(ibdev, client_name, res);
   1921	else
   1922		ret = __ib_get_global_client_nl_info(client_name, res);
   1923#ifdef CONFIG_MODULES
   1924	if (ret == -ENOENT) {
   1925		request_module("rdma-client-%s", client_name);
   1926		if (ibdev)
   1927			ret = __ib_get_client_nl_info(ibdev, client_name, res);
   1928		else
   1929			ret = __ib_get_global_client_nl_info(client_name, res);
   1930	}
   1931#endif
   1932	if (ret) {
   1933		if (ret == -ENOENT)
   1934			return -EOPNOTSUPP;
   1935		return ret;
   1936	}
   1937
   1938	if (WARN_ON(!res->cdev))
   1939		return -EINVAL;
   1940	return 0;
   1941}
   1942
   1943/**
   1944 * ib_set_client_data - Set IB client context
   1945 * @device:Device to set context for
   1946 * @client:Client to set context for
   1947 * @data:Context to set
   1948 *
   1949 * ib_set_client_data() sets client context data that can be retrieved with
   1950 * ib_get_client_data(). This can only be called while the client is
   1951 * registered to the device, once the ib_client remove() callback returns this
   1952 * cannot be called.
   1953 */
   1954void ib_set_client_data(struct ib_device *device, struct ib_client *client,
   1955			void *data)
   1956{
   1957	void *rc;
   1958
   1959	if (WARN_ON(IS_ERR(data)))
   1960		data = NULL;
   1961
   1962	rc = xa_store(&device->client_data, client->client_id, data,
   1963		      GFP_KERNEL);
   1964	WARN_ON(xa_is_err(rc));
   1965}
   1966EXPORT_SYMBOL(ib_set_client_data);
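/*
 * Illustrative sketch (not part of the original file): stashing per-device
 * client state from the client's add() callback and freeing it in remove().
 * struct demo_state is hypothetical and demo_client refers to the ib_client
 * from the ib_register_client() sketch above; assumes <rdma/ib_verbs.h> and
 * <linux/slab.h>.
 */
#if 0	/* usage sketch, not compiled */
struct demo_state {
	struct ib_device *device;
};

static int demo_client_add(struct ib_device *device)
{
	struct demo_state *st = kzalloc(sizeof(*st), GFP_KERNEL);

	if (!st)
		return -ENOMEM;
	st->device = device;
	/* Only legal between add() and the return of remove(). */
	ib_set_client_data(device, &demo_client, st);
	return 0;
}

static void demo_client_remove(struct ib_device *device, void *client_data)
{
	/* client_data is whatever add() stored via ib_set_client_data(). */
	kfree(client_data);
}
#endif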
   1967
   1968/**
   1969 * ib_register_event_handler - Register an IB event handler
   1970 * @event_handler:Handler to register
   1971 *
   1972 * ib_register_event_handler() registers an event handler that will be
   1973 * called back when asynchronous IB events occur (as defined in
   1974 * chapter 11 of the InfiniBand Architecture Specification). This
   1975 * callback occurs in workqueue context.
   1976 */
   1977void ib_register_event_handler(struct ib_event_handler *event_handler)
   1978{
   1979	down_write(&event_handler->device->event_handler_rwsem);
   1980	list_add_tail(&event_handler->list,
   1981		      &event_handler->device->event_handler_list);
   1982	up_write(&event_handler->device->event_handler_rwsem);
   1983}
   1984EXPORT_SYMBOL(ib_register_event_handler);
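/*
 * Illustrative sketch (not part of the original file): registering an async
 * event handler on a device. demo_event_handler/demo_handle_event are
 * hypothetical; assumes <rdma/ib_verbs.h>. The handler runs in workqueue
 * context, as noted in the kernel-doc above.
 */
#if 0	/* usage sketch, not compiled */
static struct ib_event_handler demo_event_handler;

static void demo_handle_event(struct ib_event_handler *handler,
			      struct ib_event *event)
{
	pr_info("demo: async event %d on %s\n", event->event,
		dev_name(&event->device->dev));
}

static void demo_watch_events(struct ib_device *device)
{
	INIT_IB_EVENT_HANDLER(&demo_event_handler, device, demo_handle_event);
	ib_register_event_handler(&demo_event_handler);
	/* ... later: ib_unregister_event_handler(&demo_event_handler); */
}
#endif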
   1985
   1986/**
   1987 * ib_unregister_event_handler - Unregister an event handler
   1988 * @event_handler:Handler to unregister
   1989 *
   1990 * Unregister an event handler registered with
   1991 * ib_register_event_handler().
   1992 */
   1993void ib_unregister_event_handler(struct ib_event_handler *event_handler)
   1994{
   1995	down_write(&event_handler->device->event_handler_rwsem);
   1996	list_del(&event_handler->list);
   1997	up_write(&event_handler->device->event_handler_rwsem);
   1998}
   1999EXPORT_SYMBOL(ib_unregister_event_handler);
   2000
   2001void ib_dispatch_event_clients(struct ib_event *event)
   2002{
   2003	struct ib_event_handler *handler;
   2004
   2005	down_read(&event->device->event_handler_rwsem);
   2006
   2007	list_for_each_entry(handler, &event->device->event_handler_list, list)
   2008		handler->handler(handler, event);
   2009
   2010	up_read(&event->device->event_handler_rwsem);
   2011}
   2012
   2013static int iw_query_port(struct ib_device *device,
   2014			   u32 port_num,
   2015			   struct ib_port_attr *port_attr)
   2016{
   2017	struct in_device *inetdev;
   2018	struct net_device *netdev;
   2019
   2020	memset(port_attr, 0, sizeof(*port_attr));
   2021
   2022	netdev = ib_device_get_netdev(device, port_num);
   2023	if (!netdev)
   2024		return -ENODEV;
   2025
   2026	port_attr->max_mtu = IB_MTU_4096;
   2027	port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
   2028
   2029	if (!netif_carrier_ok(netdev)) {
   2030		port_attr->state = IB_PORT_DOWN;
   2031		port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
   2032	} else {
   2033		rcu_read_lock();
   2034		inetdev = __in_dev_get_rcu(netdev);
   2035
   2036		if (inetdev && inetdev->ifa_list) {
   2037			port_attr->state = IB_PORT_ACTIVE;
   2038			port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
   2039		} else {
   2040			port_attr->state = IB_PORT_INIT;
   2041			port_attr->phys_state =
   2042				IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING;
   2043		}
   2044
   2045		rcu_read_unlock();
   2046	}
   2047
   2048	dev_put(netdev);
   2049	return device->ops.query_port(device, port_num, port_attr);
   2050}
   2051
   2052static int __ib_query_port(struct ib_device *device,
   2053			   u32 port_num,
   2054			   struct ib_port_attr *port_attr)
   2055{
   2056	int err;
   2057
   2058	memset(port_attr, 0, sizeof(*port_attr));
   2059
   2060	err = device->ops.query_port(device, port_num, port_attr);
   2061	if (err || port_attr->subnet_prefix)
   2062		return err;
   2063
   2064	if (rdma_port_get_link_layer(device, port_num) !=
   2065	    IB_LINK_LAYER_INFINIBAND)
   2066		return 0;
   2067
   2068	ib_get_cached_subnet_prefix(device, port_num,
   2069				    &port_attr->subnet_prefix);
   2070	return 0;
   2071}
   2072
   2073/**
   2074 * ib_query_port - Query IB port attributes
   2075 * @device:Device to query
   2076 * @port_num:Port number to query
   2077 * @port_attr:Port attributes
   2078 *
   2079 * ib_query_port() returns the attributes of a port through the
   2080 * @port_attr pointer.
   2081 */
   2082int ib_query_port(struct ib_device *device,
   2083		  u32 port_num,
   2084		  struct ib_port_attr *port_attr)
   2085{
   2086	if (!rdma_is_port_valid(device, port_num))
   2087		return -EINVAL;
   2088
   2089	if (rdma_protocol_iwarp(device, port_num))
   2090		return iw_query_port(device, port_num, port_attr);
   2091	else
   2092		return __ib_query_port(device, port_num, port_attr);
   2093}
   2094EXPORT_SYMBOL(ib_query_port);
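/*
 * Illustrative sketch (not part of the original file): reading port
 * attributes for every port of a device. demo_dump_ports() is hypothetical;
 * assumes <rdma/ib_verbs.h>. For iWarp ports the core derives the link state
 * from the associated netdev, as implemented in iw_query_port() above.
 */
#if 0	/* usage sketch, not compiled */
static void demo_dump_ports(struct ib_device *device)
{
	struct ib_port_attr attr;
	u32 port;

	rdma_for_each_port(device, port) {
		if (ib_query_port(device, port, &attr))
			continue;
		pr_info("demo: %s port %u state %d active_mtu %d\n",
			dev_name(&device->dev), port, attr.state,
			attr.active_mtu);
	}
}
#endif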
   2095
   2096static void add_ndev_hash(struct ib_port_data *pdata)
   2097{
   2098	unsigned long flags;
   2099
   2100	might_sleep();
   2101
   2102	spin_lock_irqsave(&ndev_hash_lock, flags);
   2103	if (hash_hashed(&pdata->ndev_hash_link)) {
   2104		hash_del_rcu(&pdata->ndev_hash_link);
   2105		spin_unlock_irqrestore(&ndev_hash_lock, flags);
   2106		/*
    2107		 * We cannot do hash_add_rcu after a hash_del_rcu until the
    2108		 * grace period has elapsed.
   2109		 */
   2110		synchronize_rcu();
   2111		spin_lock_irqsave(&ndev_hash_lock, flags);
   2112	}
   2113	if (pdata->netdev)
   2114		hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
   2115			     (uintptr_t)pdata->netdev);
   2116	spin_unlock_irqrestore(&ndev_hash_lock, flags);
   2117}
   2118
   2119/**
   2120 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
   2121 * @ib_dev: Device to modify
   2122 * @ndev: net_device to affiliate, may be NULL
   2123 * @port: IB port the net_device is connected to
   2124 *
   2125 * Drivers should use this to link the ib_device to a netdev so the netdev
   2126 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
   2127 * affiliated with any port.
   2128 *
   2129 * The caller must ensure that the given ndev is not unregistered or
   2130 * unregistering, and that either the ib_device is unregistered or
   2131 * ib_device_set_netdev() is called with NULL when the ndev sends a
   2132 * NETDEV_UNREGISTER event.
   2133 */
   2134int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
   2135			 u32 port)
   2136{
   2137	struct net_device *old_ndev;
   2138	struct ib_port_data *pdata;
   2139	unsigned long flags;
   2140	int ret;
   2141
   2142	/*
    2143	 * Drivers wish to call this before ib_register_device(), so we have to
    2144	 * set up the port data early.
   2145	 */
   2146	ret = alloc_port_data(ib_dev);
   2147	if (ret)
   2148		return ret;
   2149
   2150	if (!rdma_is_port_valid(ib_dev, port))
   2151		return -EINVAL;
   2152
   2153	pdata = &ib_dev->port_data[port];
   2154	spin_lock_irqsave(&pdata->netdev_lock, flags);
   2155	old_ndev = rcu_dereference_protected(
   2156		pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
   2157	if (old_ndev == ndev) {
   2158		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
   2159		return 0;
   2160	}
   2161
   2162	if (ndev)
   2163		dev_hold(ndev);
   2164	rcu_assign_pointer(pdata->netdev, ndev);
   2165	spin_unlock_irqrestore(&pdata->netdev_lock, flags);
   2166
   2167	add_ndev_hash(pdata);
   2168	if (old_ndev)
   2169		dev_put(old_ndev);
   2170
   2171	return 0;
   2172}
   2173EXPORT_SYMBOL(ib_device_set_netdev);
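/*
 * Illustrative sketch (not part of the original file): a RoCE-style driver
 * linking its ib_device to the underlying netdev and later breaking the
 * link. demo_bind_netdev()/demo_unbind_netdev() and the single port number 1
 * are hypothetical; assumes <rdma/ib_verbs.h>.
 */
#if 0	/* usage sketch, not compiled */
static int demo_bind_netdev(struct ib_device *ibdev, struct net_device *ndev)
{
	/* Legal before ib_register_device(); port data is allocated on demand. */
	return ib_device_set_netdev(ibdev, ndev, 1);
}

static void demo_unbind_netdev(struct ib_device *ibdev)
{
	/* E.g. on NETDEV_UNREGISTER, per the kernel-doc above. */
	ib_device_set_netdev(ibdev, NULL, 1);
}
#endif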
   2174
   2175static void free_netdevs(struct ib_device *ib_dev)
   2176{
   2177	unsigned long flags;
   2178	u32 port;
   2179
   2180	if (!ib_dev->port_data)
   2181		return;
   2182
   2183	rdma_for_each_port (ib_dev, port) {
   2184		struct ib_port_data *pdata = &ib_dev->port_data[port];
   2185		struct net_device *ndev;
   2186
   2187		spin_lock_irqsave(&pdata->netdev_lock, flags);
   2188		ndev = rcu_dereference_protected(
   2189			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
   2190		if (ndev) {
   2191			spin_lock(&ndev_hash_lock);
   2192			hash_del_rcu(&pdata->ndev_hash_link);
   2193			spin_unlock(&ndev_hash_lock);
   2194
   2195			/*
   2196			 * If this is the last dev_put there is still a
   2197			 * synchronize_rcu before the netdev is kfreed, so we
   2198			 * can continue to rely on unlocked pointer
   2199			 * comparisons after the put
   2200			 */
   2201			rcu_assign_pointer(pdata->netdev, NULL);
   2202			dev_put(ndev);
   2203		}
   2204		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
   2205	}
   2206}
   2207
   2208struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
   2209					u32 port)
   2210{
   2211	struct ib_port_data *pdata;
   2212	struct net_device *res;
   2213
   2214	if (!rdma_is_port_valid(ib_dev, port))
   2215		return NULL;
   2216
   2217	pdata = &ib_dev->port_data[port];
   2218
   2219	/*
   2220	 * New drivers should use ib_device_set_netdev() not the legacy
   2221	 * get_netdev().
   2222	 */
   2223	if (ib_dev->ops.get_netdev)
   2224		res = ib_dev->ops.get_netdev(ib_dev, port);
   2225	else {
   2226		spin_lock(&pdata->netdev_lock);
   2227		res = rcu_dereference_protected(
   2228			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
   2229		if (res)
   2230			dev_hold(res);
   2231		spin_unlock(&pdata->netdev_lock);
   2232	}
   2233
   2234	/*
    2235	 * If we are starting to unregister, expedite things by preventing
   2236	 * propagation of an unregistering netdev.
   2237	 */
   2238	if (res && res->reg_state != NETREG_REGISTERED) {
   2239		dev_put(res);
   2240		return NULL;
   2241	}
   2242
   2243	return res;
   2244}
   2245
   2246/**
   2247 * ib_device_get_by_netdev - Find an IB device associated with a netdev
   2248 * @ndev: netdev to locate
   2249 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
   2250 *
   2251 * Find and hold an ib_device that is associated with a netdev via
   2252 * ib_device_set_netdev(). The caller must call ib_device_put() on the
   2253 * returned pointer.
   2254 */
   2255struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
   2256					  enum rdma_driver_id driver_id)
   2257{
   2258	struct ib_device *res = NULL;
   2259	struct ib_port_data *cur;
   2260
   2261	rcu_read_lock();
   2262	hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
   2263				    (uintptr_t)ndev) {
   2264		if (rcu_access_pointer(cur->netdev) == ndev &&
   2265		    (driver_id == RDMA_DRIVER_UNKNOWN ||
   2266		     cur->ib_dev->ops.driver_id == driver_id) &&
   2267		    ib_device_try_get(cur->ib_dev)) {
   2268			res = cur->ib_dev;
   2269			break;
   2270		}
   2271	}
   2272	rcu_read_unlock();
   2273
   2274	return res;
   2275}
   2276EXPORT_SYMBOL(ib_device_get_by_netdev);
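/*
 * Illustrative sketch (not part of the original file): resolving the RDMA
 * device behind a netdev and releasing the reference afterwards.
 * demo_find_rdma_dev() is hypothetical; assumes <rdma/ib_verbs.h>.
 */
#if 0	/* usage sketch, not compiled */
static void demo_find_rdma_dev(struct net_device *ndev)
{
	struct ib_device *ibdev;

	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
	if (!ibdev)
		return;

	pr_info("demo: %s is backed by RDMA device %s\n", ndev->name,
		dev_name(&ibdev->dev));

	/* Every successful lookup must be balanced with ib_device_put(). */
	ib_device_put(ibdev);
}
#endif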
   2277
   2278/**
   2279 * ib_enum_roce_netdev - enumerate all RoCE ports
    2280 * @ib_dev: IB device we want to query
    2281 * @filter: Should we call the callback?
    2282 * @filter_cookie: Cookie passed to filter
    2283 * @cb: Callback to call for each found RoCE port
    2284 * @cookie: Cookie passed back to the callback
    2285 *
    2286 * Enumerates all of the physical RoCE ports of ib_dev
    2287 * which are related to a netdevice and calls cb() on each
    2288 * port for which the filter() function returns non-zero.
   2289 */
   2290void ib_enum_roce_netdev(struct ib_device *ib_dev,
   2291			 roce_netdev_filter filter,
   2292			 void *filter_cookie,
   2293			 roce_netdev_callback cb,
   2294			 void *cookie)
   2295{
   2296	u32 port;
   2297
   2298	rdma_for_each_port (ib_dev, port)
   2299		if (rdma_protocol_roce(ib_dev, port)) {
   2300			struct net_device *idev =
   2301				ib_device_get_netdev(ib_dev, port);
   2302
   2303			if (filter(ib_dev, port, idev, filter_cookie))
   2304				cb(ib_dev, port, idev, cookie);
   2305
   2306			if (idev)
   2307				dev_put(idev);
   2308		}
   2309}
   2310
   2311/**
   2312 * ib_enum_all_roce_netdevs - enumerate all RoCE devices
   2313 * @filter: Should we call the callback?
   2314 * @filter_cookie: Cookie passed to filter
    2315 * @cb: Callback to call for each found RoCE port
    2316 * @cookie: Cookie passed back to the callback
    2317 *
    2318 * Enumerates all RoCE devices' physical ports which are related
    2319 * to netdevices and calls cb() on each port for which the
    2320 * filter() function returns non-zero.
   2321 */
   2322void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
   2323			      void *filter_cookie,
   2324			      roce_netdev_callback cb,
   2325			      void *cookie)
   2326{
   2327	struct ib_device *dev;
   2328	unsigned long index;
   2329
   2330	down_read(&devices_rwsem);
   2331	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
   2332		ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
   2333	up_read(&devices_rwsem);
   2334}
   2335
   2336/*
   2337 * ib_enum_all_devs - enumerate all ib_devices
   2338 * @cb: Callback to call for each found ib_device
   2339 *
   2340 * Enumerates all ib_devices and calls callback() on each device.
   2341 */
   2342int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
   2343		     struct netlink_callback *cb)
   2344{
   2345	unsigned long index;
   2346	struct ib_device *dev;
   2347	unsigned int idx = 0;
   2348	int ret = 0;
   2349
   2350	down_read(&devices_rwsem);
   2351	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
   2352		if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
   2353			continue;
   2354
   2355		ret = nldev_cb(dev, skb, cb, idx);
   2356		if (ret)
   2357			break;
   2358		idx++;
   2359	}
   2360	up_read(&devices_rwsem);
   2361	return ret;
   2362}
   2363
   2364/**
   2365 * ib_query_pkey - Get P_Key table entry
   2366 * @device:Device to query
   2367 * @port_num:Port number to query
   2368 * @index:P_Key table index to query
   2369 * @pkey:Returned P_Key
   2370 *
   2371 * ib_query_pkey() fetches the specified P_Key table entry.
   2372 */
   2373int ib_query_pkey(struct ib_device *device,
   2374		  u32 port_num, u16 index, u16 *pkey)
   2375{
   2376	if (!rdma_is_port_valid(device, port_num))
   2377		return -EINVAL;
   2378
   2379	if (!device->ops.query_pkey)
   2380		return -EOPNOTSUPP;
   2381
   2382	return device->ops.query_pkey(device, port_num, index, pkey);
   2383}
   2384EXPORT_SYMBOL(ib_query_pkey);
   2385
   2386/**
   2387 * ib_modify_device - Change IB device attributes
   2388 * @device:Device to modify
   2389 * @device_modify_mask:Mask of attributes to change
   2390 * @device_modify:New attribute values
   2391 *
   2392 * ib_modify_device() changes a device's attributes as specified by
   2393 * the @device_modify_mask and @device_modify structure.
   2394 */
   2395int ib_modify_device(struct ib_device *device,
   2396		     int device_modify_mask,
   2397		     struct ib_device_modify *device_modify)
   2398{
   2399	if (!device->ops.modify_device)
   2400		return -EOPNOTSUPP;
   2401
   2402	return device->ops.modify_device(device, device_modify_mask,
   2403					 device_modify);
   2404}
   2405EXPORT_SYMBOL(ib_modify_device);
   2406
   2407/**
   2408 * ib_modify_port - Modifies the attributes for the specified port.
   2409 * @device: The device to modify.
   2410 * @port_num: The number of the port to modify.
   2411 * @port_modify_mask: Mask used to specify which attributes of the port
   2412 *   to change.
   2413 * @port_modify: New attribute values for the port.
   2414 *
   2415 * ib_modify_port() changes a port's attributes as specified by the
   2416 * @port_modify_mask and @port_modify structure.
   2417 */
   2418int ib_modify_port(struct ib_device *device,
   2419		   u32 port_num, int port_modify_mask,
   2420		   struct ib_port_modify *port_modify)
   2421{
   2422	int rc;
   2423
   2424	if (!rdma_is_port_valid(device, port_num))
   2425		return -EINVAL;
   2426
   2427	if (device->ops.modify_port)
   2428		rc = device->ops.modify_port(device, port_num,
   2429					     port_modify_mask,
   2430					     port_modify);
   2431	else if (rdma_protocol_roce(device, port_num) &&
   2432		 ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 ||
   2433		  (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0))
   2434		rc = 0;
   2435	else
   2436		rc = -EOPNOTSUPP;
   2437	return rc;
   2438}
   2439EXPORT_SYMBOL(ib_modify_port);
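/*
 * Illustrative sketch (not part of the original file): advertising CM support
 * on a port. For RoCE ports without a modify_port op the core accepts a pure
 * IB_PORT_CM_SUP change and returns 0, as implemented above. demo_enable_cm()
 * is hypothetical; assumes <rdma/ib_verbs.h>.
 */
#if 0	/* usage sketch, not compiled */
static int demo_enable_cm(struct ib_device *device, u32 port_num)
{
	struct ib_port_modify port_modify = {
		.set_port_cap_mask = IB_PORT_CM_SUP,
	};

	return ib_modify_port(device, port_num, 0, &port_modify);
}
#endif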
   2440
   2441/**
   2442 * ib_find_gid - Returns the port number and GID table index where
    2443 *   a specified GID value occurs. It searches only the IB link layer.
   2444 * @device: The device to query.
   2445 * @gid: The GID value to search for.
   2446 * @port_num: The port number of the device where the GID value was found.
   2447 * @index: The index into the GID table where the GID was found.  This
   2448 *   parameter may be NULL.
   2449 */
   2450int ib_find_gid(struct ib_device *device, union ib_gid *gid,
   2451		u32 *port_num, u16 *index)
   2452{
   2453	union ib_gid tmp_gid;
   2454	u32 port;
   2455	int ret, i;
   2456
   2457	rdma_for_each_port (device, port) {
   2458		if (!rdma_protocol_ib(device, port))
   2459			continue;
   2460
   2461		for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
   2462		     ++i) {
   2463			ret = rdma_query_gid(device, port, i, &tmp_gid);
   2464			if (ret)
   2465				continue;
   2466
   2467			if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
   2468				*port_num = port;
   2469				if (index)
   2470					*index = i;
   2471				return 0;
   2472			}
   2473		}
   2474	}
   2475
   2476	return -ENOENT;
   2477}
   2478EXPORT_SYMBOL(ib_find_gid);
   2479
   2480/**
   2481 * ib_find_pkey - Returns the PKey table index where a specified
   2482 *   PKey value occurs.
   2483 * @device: The device to query.
   2484 * @port_num: The port number of the device to search for the PKey.
   2485 * @pkey: The PKey value to search for.
   2486 * @index: The index into the PKey table where the PKey was found.
   2487 */
   2488int ib_find_pkey(struct ib_device *device,
   2489		 u32 port_num, u16 pkey, u16 *index)
   2490{
   2491	int ret, i;
   2492	u16 tmp_pkey;
   2493	int partial_ix = -1;
   2494
   2495	for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
   2496	     ++i) {
   2497		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
   2498		if (ret)
   2499			return ret;
   2500		if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
    2501			/* if there is a full-member pkey, take it */
   2502			if (tmp_pkey & 0x8000) {
   2503				*index = i;
   2504				return 0;
   2505			}
   2506			if (partial_ix < 0)
   2507				partial_ix = i;
   2508		}
   2509	}
   2510
    2511	/* no full member found; if a limited member exists, take it */
   2512	if (partial_ix >= 0) {
   2513		*index = partial_ix;
   2514		return 0;
   2515	}
   2516	return -ENOENT;
   2517}
   2518EXPORT_SYMBOL(ib_find_pkey);
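/*
 * Illustrative sketch (not part of the original file): locating the table
 * index of the default P_Key. Full members (bit 15 set) are preferred over
 * limited members, as implemented above. demo_default_pkey_index() is
 * hypothetical; assumes <rdma/ib_verbs.h>.
 */
#if 0	/* usage sketch, not compiled */
static int demo_default_pkey_index(struct ib_device *device, u32 port_num,
				   u16 *index)
{
	/* 0xffff is the full-member default P_Key. */
	return ib_find_pkey(device, port_num, 0xffff, index);
}
#endif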
   2519
   2520/**
   2521 * ib_get_net_dev_by_params() - Return the appropriate net_dev
   2522 * for a received CM request
   2523 * @dev:	An RDMA device on which the request has been received.
   2524 * @port:	Port number on the RDMA device.
   2525 * @pkey:	The Pkey the request came on.
   2526 * @gid:	A GID that the net_dev uses to communicate.
   2527 * @addr:	Contains the IP address that the request specified as its
   2528 *		destination.
   2529 *
   2530 */
   2531struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
   2532					    u32 port,
   2533					    u16 pkey,
   2534					    const union ib_gid *gid,
   2535					    const struct sockaddr *addr)
   2536{
   2537	struct net_device *net_dev = NULL;
   2538	unsigned long index;
   2539	void *client_data;
   2540
   2541	if (!rdma_protocol_ib(dev, port))
   2542		return NULL;
   2543
   2544	/*
   2545	 * Holding the read side guarantees that the client will not become
   2546	 * unregistered while we are calling get_net_dev_by_params()
   2547	 */
   2548	down_read(&dev->client_data_rwsem);
   2549	xan_for_each_marked (&dev->client_data, index, client_data,
   2550			     CLIENT_DATA_REGISTERED) {
   2551		struct ib_client *client = xa_load(&clients, index);
   2552
   2553		if (!client || !client->get_net_dev_by_params)
   2554			continue;
   2555
   2556		net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
   2557							addr, client_data);
   2558		if (net_dev)
   2559			break;
   2560	}
   2561	up_read(&dev->client_data_rwsem);
   2562
   2563	return net_dev;
   2564}
   2565EXPORT_SYMBOL(ib_get_net_dev_by_params);
   2566
   2567void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
   2568{
   2569	struct ib_device_ops *dev_ops = &dev->ops;
   2570#define SET_DEVICE_OP(ptr, name)                                               \
   2571	do {                                                                   \
   2572		if (ops->name)                                                 \
   2573			if (!((ptr)->name))				       \
   2574				(ptr)->name = ops->name;                       \
   2575	} while (0)
   2576
   2577#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
   2578
   2579	if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
   2580		WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
   2581			dev_ops->driver_id != ops->driver_id);
   2582		dev_ops->driver_id = ops->driver_id;
   2583	}
   2584	if (ops->owner) {
   2585		WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
   2586		dev_ops->owner = ops->owner;
   2587	}
   2588	if (ops->uverbs_abi_ver)
   2589		dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;
   2590
   2591	dev_ops->uverbs_no_driver_id_binding |=
   2592		ops->uverbs_no_driver_id_binding;
   2593
   2594	SET_DEVICE_OP(dev_ops, add_gid);
   2595	SET_DEVICE_OP(dev_ops, advise_mr);
   2596	SET_DEVICE_OP(dev_ops, alloc_dm);
   2597	SET_DEVICE_OP(dev_ops, alloc_hw_device_stats);
   2598	SET_DEVICE_OP(dev_ops, alloc_hw_port_stats);
   2599	SET_DEVICE_OP(dev_ops, alloc_mr);
   2600	SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
   2601	SET_DEVICE_OP(dev_ops, alloc_mw);
   2602	SET_DEVICE_OP(dev_ops, alloc_pd);
   2603	SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
   2604	SET_DEVICE_OP(dev_ops, alloc_ucontext);
   2605	SET_DEVICE_OP(dev_ops, alloc_xrcd);
   2606	SET_DEVICE_OP(dev_ops, attach_mcast);
   2607	SET_DEVICE_OP(dev_ops, check_mr_status);
   2608	SET_DEVICE_OP(dev_ops, counter_alloc_stats);
   2609	SET_DEVICE_OP(dev_ops, counter_bind_qp);
   2610	SET_DEVICE_OP(dev_ops, counter_dealloc);
   2611	SET_DEVICE_OP(dev_ops, counter_unbind_qp);
   2612	SET_DEVICE_OP(dev_ops, counter_update_stats);
   2613	SET_DEVICE_OP(dev_ops, create_ah);
   2614	SET_DEVICE_OP(dev_ops, create_counters);
   2615	SET_DEVICE_OP(dev_ops, create_cq);
   2616	SET_DEVICE_OP(dev_ops, create_flow);
   2617	SET_DEVICE_OP(dev_ops, create_qp);
   2618	SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
   2619	SET_DEVICE_OP(dev_ops, create_srq);
   2620	SET_DEVICE_OP(dev_ops, create_user_ah);
   2621	SET_DEVICE_OP(dev_ops, create_wq);
   2622	SET_DEVICE_OP(dev_ops, dealloc_dm);
   2623	SET_DEVICE_OP(dev_ops, dealloc_driver);
   2624	SET_DEVICE_OP(dev_ops, dealloc_mw);
   2625	SET_DEVICE_OP(dev_ops, dealloc_pd);
   2626	SET_DEVICE_OP(dev_ops, dealloc_ucontext);
   2627	SET_DEVICE_OP(dev_ops, dealloc_xrcd);
   2628	SET_DEVICE_OP(dev_ops, del_gid);
   2629	SET_DEVICE_OP(dev_ops, dereg_mr);
   2630	SET_DEVICE_OP(dev_ops, destroy_ah);
   2631	SET_DEVICE_OP(dev_ops, destroy_counters);
   2632	SET_DEVICE_OP(dev_ops, destroy_cq);
   2633	SET_DEVICE_OP(dev_ops, destroy_flow);
   2634	SET_DEVICE_OP(dev_ops, destroy_flow_action);
   2635	SET_DEVICE_OP(dev_ops, destroy_qp);
   2636	SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table);
   2637	SET_DEVICE_OP(dev_ops, destroy_srq);
   2638	SET_DEVICE_OP(dev_ops, destroy_wq);
   2639	SET_DEVICE_OP(dev_ops, device_group);
   2640	SET_DEVICE_OP(dev_ops, detach_mcast);
   2641	SET_DEVICE_OP(dev_ops, disassociate_ucontext);
   2642	SET_DEVICE_OP(dev_ops, drain_rq);
   2643	SET_DEVICE_OP(dev_ops, drain_sq);
   2644	SET_DEVICE_OP(dev_ops, enable_driver);
   2645	SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry);
   2646	SET_DEVICE_OP(dev_ops, fill_res_cq_entry);
   2647	SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw);
   2648	SET_DEVICE_OP(dev_ops, fill_res_mr_entry);
   2649	SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw);
   2650	SET_DEVICE_OP(dev_ops, fill_res_qp_entry);
   2651	SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw);
   2652	SET_DEVICE_OP(dev_ops, fill_stat_mr_entry);
   2653	SET_DEVICE_OP(dev_ops, get_dev_fw_str);
   2654	SET_DEVICE_OP(dev_ops, get_dma_mr);
   2655	SET_DEVICE_OP(dev_ops, get_hw_stats);
   2656	SET_DEVICE_OP(dev_ops, get_link_layer);
   2657	SET_DEVICE_OP(dev_ops, get_netdev);
   2658	SET_DEVICE_OP(dev_ops, get_numa_node);
   2659	SET_DEVICE_OP(dev_ops, get_port_immutable);
   2660	SET_DEVICE_OP(dev_ops, get_vector_affinity);
   2661	SET_DEVICE_OP(dev_ops, get_vf_config);
   2662	SET_DEVICE_OP(dev_ops, get_vf_guid);
   2663	SET_DEVICE_OP(dev_ops, get_vf_stats);
   2664	SET_DEVICE_OP(dev_ops, iw_accept);
   2665	SET_DEVICE_OP(dev_ops, iw_add_ref);
   2666	SET_DEVICE_OP(dev_ops, iw_connect);
   2667	SET_DEVICE_OP(dev_ops, iw_create_listen);
   2668	SET_DEVICE_OP(dev_ops, iw_destroy_listen);
   2669	SET_DEVICE_OP(dev_ops, iw_get_qp);
   2670	SET_DEVICE_OP(dev_ops, iw_reject);
   2671	SET_DEVICE_OP(dev_ops, iw_rem_ref);
   2672	SET_DEVICE_OP(dev_ops, map_mr_sg);
   2673	SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
   2674	SET_DEVICE_OP(dev_ops, mmap);
   2675	SET_DEVICE_OP(dev_ops, mmap_free);
   2676	SET_DEVICE_OP(dev_ops, modify_ah);
   2677	SET_DEVICE_OP(dev_ops, modify_cq);
   2678	SET_DEVICE_OP(dev_ops, modify_device);
   2679	SET_DEVICE_OP(dev_ops, modify_hw_stat);
   2680	SET_DEVICE_OP(dev_ops, modify_port);
   2681	SET_DEVICE_OP(dev_ops, modify_qp);
   2682	SET_DEVICE_OP(dev_ops, modify_srq);
   2683	SET_DEVICE_OP(dev_ops, modify_wq);
   2684	SET_DEVICE_OP(dev_ops, peek_cq);
   2685	SET_DEVICE_OP(dev_ops, poll_cq);
   2686	SET_DEVICE_OP(dev_ops, port_groups);
   2687	SET_DEVICE_OP(dev_ops, post_recv);
   2688	SET_DEVICE_OP(dev_ops, post_send);
   2689	SET_DEVICE_OP(dev_ops, post_srq_recv);
   2690	SET_DEVICE_OP(dev_ops, process_mad);
   2691	SET_DEVICE_OP(dev_ops, query_ah);
   2692	SET_DEVICE_OP(dev_ops, query_device);
   2693	SET_DEVICE_OP(dev_ops, query_gid);
   2694	SET_DEVICE_OP(dev_ops, query_pkey);
   2695	SET_DEVICE_OP(dev_ops, query_port);
   2696	SET_DEVICE_OP(dev_ops, query_qp);
   2697	SET_DEVICE_OP(dev_ops, query_srq);
   2698	SET_DEVICE_OP(dev_ops, query_ucontext);
   2699	SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
   2700	SET_DEVICE_OP(dev_ops, read_counters);
   2701	SET_DEVICE_OP(dev_ops, reg_dm_mr);
   2702	SET_DEVICE_OP(dev_ops, reg_user_mr);
   2703	SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf);
   2704	SET_DEVICE_OP(dev_ops, req_notify_cq);
   2705	SET_DEVICE_OP(dev_ops, rereg_user_mr);
   2706	SET_DEVICE_OP(dev_ops, resize_cq);
   2707	SET_DEVICE_OP(dev_ops, set_vf_guid);
   2708	SET_DEVICE_OP(dev_ops, set_vf_link_state);
   2709
   2710	SET_OBJ_SIZE(dev_ops, ib_ah);
   2711	SET_OBJ_SIZE(dev_ops, ib_counters);
   2712	SET_OBJ_SIZE(dev_ops, ib_cq);
   2713	SET_OBJ_SIZE(dev_ops, ib_mw);
   2714	SET_OBJ_SIZE(dev_ops, ib_pd);
   2715	SET_OBJ_SIZE(dev_ops, ib_qp);
   2716	SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table);
   2717	SET_OBJ_SIZE(dev_ops, ib_srq);
   2718	SET_OBJ_SIZE(dev_ops, ib_ucontext);
   2719	SET_OBJ_SIZE(dev_ops, ib_xrcd);
   2720}
   2721EXPORT_SYMBOL(ib_set_device_ops);
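/*
 * Illustrative sketch (not part of the original file): how a driver feeds its
 * op table to the core. ib_set_device_ops() only fills slots that are still
 * unset, so stacked callers (a common layer plus a hw-specific layer) compose.
 * All demo_* names are hypothetical and a real driver uses its own driver_id;
 * assumes <rdma/ib_verbs.h> and <linux/module.h>. Real drivers also declare
 * the size_ib_* object sizes (consumed by SET_OBJ_SIZE above) via the
 * INIT_RDMA_OBJ_SIZE() macro.
 */
#if 0	/* usage sketch, not compiled */
static int demo_query_port(struct ib_device *device, u32 port_num,
			   struct ib_port_attr *port_attr)
{
	port_attr->state = IB_PORT_ACTIVE;
	return 0;
}

static void demo_dealloc_driver(struct ib_device *device)
{
	/* Final per-device teardown; required for queued unregistration. */
}

static const struct ib_device_ops demo_dev_ops = {
	.owner = THIS_MODULE,
	.driver_id = RDMA_DRIVER_UNKNOWN,
	.query_port = demo_query_port,
	.dealloc_driver = demo_dealloc_driver,
};

static void demo_init_device(struct ib_device *ibdev)
{
	ib_set_device_ops(ibdev, &demo_dev_ops);
}
#endif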
   2722
   2723#ifdef CONFIG_INFINIBAND_VIRT_DMA
   2724int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents)
   2725{
   2726	struct scatterlist *s;
   2727	int i;
   2728
   2729	for_each_sg(sg, s, nents, i) {
   2730		sg_dma_address(s) = (uintptr_t)sg_virt(s);
   2731		sg_dma_len(s) = s->length;
   2732	}
   2733	return nents;
   2734}
   2735EXPORT_SYMBOL(ib_dma_virt_map_sg);
   2736#endif /* CONFIG_INFINIBAND_VIRT_DMA */
   2737
   2738static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
   2739	[RDMA_NL_LS_OP_RESOLVE] = {
   2740		.doit = ib_nl_handle_resolve_resp,
   2741		.flags = RDMA_NL_ADMIN_PERM,
   2742	},
   2743	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
   2744		.doit = ib_nl_handle_set_timeout,
   2745		.flags = RDMA_NL_ADMIN_PERM,
   2746	},
   2747	[RDMA_NL_LS_OP_IP_RESOLVE] = {
   2748		.doit = ib_nl_handle_ip_res_resp,
   2749		.flags = RDMA_NL_ADMIN_PERM,
   2750	},
   2751};
   2752
   2753static int __init ib_core_init(void)
   2754{
   2755	int ret = -ENOMEM;
   2756
   2757	ib_wq = alloc_workqueue("infiniband", 0, 0);
   2758	if (!ib_wq)
   2759		return -ENOMEM;
   2760
   2761	ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND,
   2762				      WQ_UNBOUND_MAX_ACTIVE);
   2763	if (!ib_unreg_wq)
   2764		goto err;
   2765
   2766	ib_comp_wq = alloc_workqueue("ib-comp-wq",
   2767			WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
   2768	if (!ib_comp_wq)
   2769		goto err_unbound;
   2770
   2771	ib_comp_unbound_wq =
   2772		alloc_workqueue("ib-comp-unb-wq",
   2773				WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
   2774				WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
   2775	if (!ib_comp_unbound_wq)
   2776		goto err_comp;
   2777
   2778	ret = class_register(&ib_class);
   2779	if (ret) {
   2780		pr_warn("Couldn't create InfiniBand device class\n");
   2781		goto err_comp_unbound;
   2782	}
   2783
   2784	rdma_nl_init();
   2785
   2786	ret = addr_init();
   2787	if (ret) {
   2788		pr_warn("Couldn't init IB address resolution\n");
   2789		goto err_ibnl;
   2790	}
   2791
   2792	ret = ib_mad_init();
   2793	if (ret) {
   2794		pr_warn("Couldn't init IB MAD\n");
   2795		goto err_addr;
   2796	}
   2797
   2798	ret = ib_sa_init();
   2799	if (ret) {
   2800		pr_warn("Couldn't init SA\n");
   2801		goto err_mad;
   2802	}
   2803
   2804	ret = register_blocking_lsm_notifier(&ibdev_lsm_nb);
   2805	if (ret) {
   2806		pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
   2807		goto err_sa;
   2808	}
   2809
   2810	ret = register_pernet_device(&rdma_dev_net_ops);
   2811	if (ret) {
   2812		pr_warn("Couldn't init compat dev. ret %d\n", ret);
   2813		goto err_compat;
   2814	}
   2815
   2816	nldev_init();
   2817	rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
   2818	roce_gid_mgmt_init();
   2819
   2820	return 0;
   2821
   2822err_compat:
   2823	unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
   2824err_sa:
   2825	ib_sa_cleanup();
   2826err_mad:
   2827	ib_mad_cleanup();
   2828err_addr:
   2829	addr_cleanup();
   2830err_ibnl:
   2831	class_unregister(&ib_class);
   2832err_comp_unbound:
   2833	destroy_workqueue(ib_comp_unbound_wq);
   2834err_comp:
   2835	destroy_workqueue(ib_comp_wq);
   2836err_unbound:
   2837	destroy_workqueue(ib_unreg_wq);
   2838err:
   2839	destroy_workqueue(ib_wq);
   2840	return ret;
   2841}
   2842
   2843static void __exit ib_core_cleanup(void)
   2844{
   2845	roce_gid_mgmt_cleanup();
   2846	nldev_exit();
   2847	rdma_nl_unregister(RDMA_NL_LS);
   2848	unregister_pernet_device(&rdma_dev_net_ops);
   2849	unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
   2850	ib_sa_cleanup();
   2851	ib_mad_cleanup();
   2852	addr_cleanup();
   2853	rdma_nl_exit();
   2854	class_unregister(&ib_class);
   2855	destroy_workqueue(ib_comp_unbound_wq);
   2856	destroy_workqueue(ib_comp_wq);
   2857	/* Make sure that any pending umem accounting work is done. */
   2858	destroy_workqueue(ib_wq);
   2859	destroy_workqueue(ib_unreg_wq);
   2860	WARN_ON(!xa_empty(&clients));
   2861	WARN_ON(!xa_empty(&devices));
   2862}
   2863
   2864MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
   2865
    2866/* The ib core relies on the netdev stack to register the net_ns_type_operations
    2867 * ns kobject type before ib_core initialization.
   2868 */
   2869fs_initcall(ib_core_init);
   2870module_exit(ib_core_cleanup);