From: Doug Ledford <dledford@redhat.com> Date: Tue, 14 Apr 2009 15:23:31 -0400 Subject: [openib] core: update core code to OFED 1.4.1-rc3 Message-id: 1239737023-31222-5-git-send-email-dledford@redhat.com O-Subject: [Patch RHEL5.4 04/16] [infiniband/core] Update core code to OFED 1.4.1-rc3 Bugzilla: 476301 Signed-off-by: Doug Ledford <dledford@redhat.com> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index f646040..b2e522d 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -30,3 +30,5 @@ ib_umad-y := user_mad.o ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o + +ib_core-y += namespace.o writeback.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 485f81e..09a2bec 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -4,28 +4,33 @@ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * - * This Software is licensed under one of the following licenses: + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: * - * 1) under the terms of the "Common Public License 1.0" a copy of which is - * available from the Open Source Initiative, see - * http://www.opensource.org/licenses/cpl.php. + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: * - * 2) under the terms of the "The BSD License" a copy of which is - * available from the Open Source Initiative, see - * http://www.opensource.org/licenses/bsd-license.php. + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. * - * 3) under the terms of the "GNU General Public License (GPL) Version 2" a - * copy of which is available from the Open Source Initiative, see - * http://www.opensource.org/licenses/gpl-license.php. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. * - * Licensee has the right to choose one of the above licenses. - * - * Redistributions of source code must retain the above copyright - * notice and one of the license notices. - * - * Redistributions in binary form must reproduce both the above copyright - * notice, one of the license notices in the documentation - * and/or other materials provided with the distribution. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #include <linux/mutex.h> @@ -55,7 +60,7 @@ struct addr_req { int status; }; -static void process_req(void *work); +static void process_req(struct work_struct *work); static DEFINE_MUTEX(lock); static LIST_HEAD(req_list); @@ -100,6 +105,7 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); if (dst_dev_addr) memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); + dev_addr->src_dev = dev; return 0; } EXPORT_SYMBOL(rdma_copy_addr); @@ -110,7 +116,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) __be32 ip = ((struct sockaddr_in *) addr)->sin_addr.s_addr; int ret; - dev = ip_dev_find(ip); + dev = ip_dev_find(&init_net, ip); if (!dev) return -EADDRNOTAVAIL; @@ -154,11 +160,11 @@ static void addr_send_arp(struct sockaddr_in *dst_in) { struct rtable *rt; struct flowi fl; - u32 dst_ip = dst_in->sin_addr.s_addr; + __be32 dst_ip = dst_in->sin_addr.s_addr; memset(&fl, 0, sizeof fl); fl.nl_u.ip4_u.daddr = dst_ip; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(&init_net, &rt, &fl)) return; neigh_event_send(rt->u.dst.neighbour, NULL); @@ -169,8 +175,8 @@ static int addr_resolve_remote(struct sockaddr_in *src_in, struct sockaddr_in *dst_in, struct rdma_dev_addr *addr) { - u32 src_ip = src_in->sin_addr.s_addr; - u32 dst_ip = dst_in->sin_addr.s_addr; + __be32 src_ip = src_in->sin_addr.s_addr; + __be32 dst_ip = dst_in->sin_addr.s_addr; struct flowi fl; struct rtable *rt; struct neighbour *neigh; @@ -179,7 +185,7 @@ static int addr_resolve_remote(struct sockaddr_in *src_in, memset(&fl, 0, sizeof fl); fl.nl_u.ip4_u.daddr = dst_ip; fl.nl_u.ip4_u.saddr = src_ip; - ret = ip_route_output_key(&rt, &fl); + ret = ip_route_output_key(&init_net, &rt, &fl); if (ret) goto out; @@ -214,7 +220,7 @@ out: return ret; } -static void process_req(void *work) +static void process_req(struct work_struct *work) { struct addr_req *req, *temp_req; struct sockaddr_in *src_in, *dst_in; @@ -257,19 +263,19 @@ static int addr_resolve_local(struct sockaddr_in *src_in, struct rdma_dev_addr *addr) { struct net_device *dev; - u32 src_ip = src_in->sin_addr.s_addr; + __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; int ret; - dev = ip_dev_find(dst_ip); + dev = ip_dev_find(&init_net, dst_ip); if (!dev) return -EADDRNOTAVAIL; - if (ZERONET(src_ip)) { + if (ipv4_is_zeronet(src_ip)) { src_in->sin_family = dst_in->sin_family; src_in->sin_addr.s_addr = dst_ip; ret = rdma_copy_addr(addr, dev, dev->dev_addr); - } else if (LOOPBACK(src_ip)) { + } else if (ipv4_is_loopback(src_ip)) { ret = rdma_translate_ip((struct sockaddr *)dst_in, addr); if (!ret) memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h index fb9ed14..6669287 100644 --- a/drivers/infiniband/core/agent.h +++ b/drivers/infiniband/core/agent.h @@ -32,8 +32,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: agent.h 1389 2004-12-27 22:56:47Z roland $ */ #ifndef __AGENT_H_ diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index e85f701..6888356 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -31,8 +31,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: cache.c 1349 2004-12-16 21:09:43Z roland $ */ #include <linux/module.h> diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index dc3bc17..73774b4 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2006 Intel Corporation. All rights reserved. + * Copyright (c) 2004-2007 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. @@ -31,19 +31,20 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: cm.c 4311 2005-12-05 18:42:01Z sean.hefty $ */ #include <linux/completion.h> #include <linux/dma-mapping.h> +#include <linux/device.h> #include <linux/err.h> #include <linux/idr.h> #include <linux/interrupt.h> #include <linux/random.h> #include <linux/rbtree.h> #include <linux/spinlock.h> +#include <linux/sysfs.h> #include <linux/workqueue.h> +#include <linux/kdev_t.h> #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> @@ -89,17 +90,94 @@ static struct ib_cm { struct workqueue_struct *wq; } cm; +/* Counter indexes ordered by attribute ID */ +enum { + CM_REQ_COUNTER, + CM_MRA_COUNTER, + CM_REJ_COUNTER, + CM_REP_COUNTER, + CM_RTU_COUNTER, + CM_DREQ_COUNTER, + CM_DREP_COUNTER, + CM_SIDR_REQ_COUNTER, + CM_SIDR_REP_COUNTER, + CM_LAP_COUNTER, + CM_APR_COUNTER, + CM_ATTR_COUNT, + CM_ATTR_ID_OFFSET = 0x0010, +}; + +enum { + CM_XMIT, + CM_XMIT_RETRIES, + CM_RECV, + CM_RECV_DUPLICATES, + CM_COUNTER_GROUPS +}; + +static char const counter_group_names[CM_COUNTER_GROUPS] + [sizeof("cm_rx_duplicates")] = { + "cm_tx_msgs", "cm_tx_retries", + "cm_rx_msgs", "cm_rx_duplicates" +}; + +struct cm_counter_group { + struct kobject obj; + atomic_long_t counter[CM_ATTR_COUNT]; +}; + +struct cm_counter_attribute { + struct attribute attr; + int index; +}; + +#define CM_COUNTER_ATTR(_name, _index) \ +struct cm_counter_attribute cm_##_name##_counter_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444, .owner = THIS_MODULE }, \ + .index = _index \ +} + +static CM_COUNTER_ATTR(req, CM_REQ_COUNTER); +static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER); +static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER); +static CM_COUNTER_ATTR(rep, CM_REP_COUNTER); +static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER); +static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER); +static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER); +static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER); +static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER); +static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER); +static CM_COUNTER_ATTR(apr, CM_APR_COUNTER); + +static struct attribute *cm_counter_default_attrs[] = { + &cm_req_counter_attr.attr, + &cm_mra_counter_attr.attr, + &cm_rej_counter_attr.attr, + &cm_rep_counter_attr.attr, + &cm_rtu_counter_attr.attr, + &cm_dreq_counter_attr.attr, + &cm_drep_counter_attr.attr, + &cm_sidr_req_counter_attr.attr, + &cm_sidr_rep_counter_attr.attr, + &cm_lap_counter_attr.attr, + &cm_apr_counter_attr.attr, + NULL +}; + struct cm_port { struct cm_device *cm_dev; struct ib_mad_agent *mad_agent; + struct kobject port_obj; u8 port_num; + struct cm_counter_group counter_group[CM_COUNTER_GROUPS]; }; struct cm_device { struct list_head list; - struct ib_device *device; + struct ib_device *ib_device; + struct device *device; u8 ack_delay; - struct cm_port port[0]; + struct cm_port *port[0]; }; struct cm_av { @@ -273,7 +351,7 @@ static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, { av->port = port; av->pkey_index = wc->pkey_index; - ib_init_ah_from_wc(port->cm_dev->device, port->port_num, wc, + ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc, grh, &av->ah_attr); } @@ -287,9 +365,9 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { - if (!ib_find_cached_gid(cm_dev->device, &path->sgid, + if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, &p, NULL)) { - port = &cm_dev->port[p-1]; + port = cm_dev->port[p-1]; break; } } @@ -298,13 +376,13 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) if (!port) return -EINVAL; - ret = ib_find_cached_pkey(cm_dev->device, port->port_num, + ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num, be16_to_cpu(path->pkey), &av->pkey_index); if (ret) return ret; av->port = port; - ib_init_ah_from_path(cm_dev->device, port->port_num, path, + ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, &av->ah_attr); av->timeout = path->packet_life_time + 1; return 0; @@ -325,7 +403,7 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv) spin_unlock_irqrestore(&cm.lock, flags); } while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) ); - cm_id_priv->id.local_id = (__force __be32) (id ^ cm.random_id_operand); + cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; return ret; } @@ -399,6 +477,31 @@ static int cm_compare_private_data(u8 *private_data, return memcmp(src, dst_data->data, IB_CM_COMPARE_SIZE); } +/* + * Trivial helpers to strip endian annotation and compare; the + * endianness doesn't actually matter since we just need a stable + * order for the RB tree. + */ +static int be32_lt(__be32 a, __be32 b) +{ + return (__force u32) a < (__force u32) b; +} + +static int be32_gt(__be32 a, __be32 b) +{ + return (__force u32) a > (__force u32) b; +} + +static int be64_lt(__be64 a, __be64 b) +{ + return (__force u64) a < (__force u64) b; +} + +static int be64_gt(__be64 a, __be64 b) +{ + return (__force u64) a > (__force u64) b; +} + static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) { struct rb_node **link = &cm.listen_service_table.rb_node; @@ -424,9 +527,9 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) link = &(*link)->rb_left; else if (cm_id_priv->id.device > cur_cm_id_priv->id.device) link = &(*link)->rb_right; - else if (service_id < cur_cm_id_priv->id.service_id) + else if (be64_lt(service_id, cur_cm_id_priv->id.service_id)) link = &(*link)->rb_left; - else if (service_id > cur_cm_id_priv->id.service_id) + else if (be64_gt(service_id, cur_cm_id_priv->id.service_id)) link = &(*link)->rb_right; else if (data_cmp < 0) link = &(*link)->rb_left; @@ -459,9 +562,9 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device, node = node->rb_left; else if (device > cm_id_priv->id.device) node = node->rb_right; - else if (service_id < cm_id_priv->id.service_id) + else if (be64_lt(service_id, cm_id_priv->id.service_id)) node = node->rb_left; - else if (service_id > cm_id_priv->id.service_id) + else if (be64_gt(service_id, cm_id_priv->id.service_id)) node = node->rb_right; else if (data_cmp < 0) node = node->rb_left; @@ -484,13 +587,13 @@ static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info parent = *link; cur_timewait_info = rb_entry(parent, struct cm_timewait_info, remote_id_node); - if (remote_id < cur_timewait_info->work.remote_id) + if (be32_lt(remote_id, cur_timewait_info->work.remote_id)) link = &(*link)->rb_left; - else if (remote_id > cur_timewait_info->work.remote_id) + else if (be32_gt(remote_id, cur_timewait_info->work.remote_id)) link = &(*link)->rb_right; - else if (remote_ca_guid < cur_timewait_info->remote_ca_guid) + else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_left; - else if (remote_ca_guid > cur_timewait_info->remote_ca_guid) + else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_right; else return cur_timewait_info; @@ -510,13 +613,13 @@ static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid, while (node) { timewait_info = rb_entry(node, struct cm_timewait_info, remote_id_node); - if (remote_id < timewait_info->work.remote_id) + if (be32_lt(remote_id, timewait_info->work.remote_id)) node = node->rb_left; - else if (remote_id > timewait_info->work.remote_id) + else if (be32_gt(remote_id, timewait_info->work.remote_id)) node = node->rb_right; - else if (remote_ca_guid < timewait_info->remote_ca_guid) + else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid)) node = node->rb_left; - else if (remote_ca_guid > timewait_info->remote_ca_guid) + else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid)) node = node->rb_right; else return timewait_info; @@ -537,13 +640,13 @@ static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info parent = *link; cur_timewait_info = rb_entry(parent, struct cm_timewait_info, remote_qp_node); - if (remote_qpn < cur_timewait_info->remote_qpn) + if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn)) link = &(*link)->rb_left; - else if (remote_qpn > cur_timewait_info->remote_qpn) + else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn)) link = &(*link)->rb_right; - else if (remote_ca_guid < cur_timewait_info->remote_ca_guid) + else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_left; - else if (remote_ca_guid > cur_timewait_info->remote_ca_guid) + else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_right; else return cur_timewait_info; @@ -567,9 +670,9 @@ static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private parent = *link; cur_cm_id_priv = rb_entry(parent, struct cm_id_private, sidr_id_node); - if (remote_id < cur_cm_id_priv->id.remote_id) + if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id)) link = &(*link)->rb_left; - else if (remote_id > cur_cm_id_priv->id.remote_id) + else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id)) link = &(*link)->rb_right; else { int cmp; @@ -906,6 +1009,9 @@ static void cm_format_req(struct cm_req_msg *req_msg, struct cm_id_private *cm_id_priv, struct ib_cm_req_param *param) { + struct ib_sa_path_rec *pri_path = param->primary_path; + struct ib_sa_path_rec *alt_path = param->alternate_path; + cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); @@ -941,35 +1047,46 @@ static void cm_format_req(struct cm_req_msg *req_msg, cm_req_set_max_cm_retries(req_msg, param->max_cm_retries); cm_req_set_srq(req_msg, param->srq); - req_msg->primary_local_lid = param->primary_path->slid; - req_msg->primary_remote_lid = param->primary_path->dlid; - req_msg->primary_local_gid = param->primary_path->sgid; - req_msg->primary_remote_gid = param->primary_path->dgid; - cm_req_set_primary_flow_label(req_msg, param->primary_path->flow_label); - cm_req_set_primary_packet_rate(req_msg, param->primary_path->rate); - req_msg->primary_traffic_class = param->primary_path->traffic_class; - req_msg->primary_hop_limit = param->primary_path->hop_limit; - cm_req_set_primary_sl(req_msg, param->primary_path->sl); - cm_req_set_primary_subnet_local(req_msg, 1); /* local only... */ + if (pri_path->hop_limit <= 1) { + req_msg->primary_local_lid = pri_path->slid; + req_msg->primary_remote_lid = pri_path->dlid; + } else { + /* Work-around until there's a way to obtain remote LID info */ + req_msg->primary_local_lid = IB_LID_PERMISSIVE; + req_msg->primary_remote_lid = IB_LID_PERMISSIVE; + } + req_msg->primary_local_gid = pri_path->sgid; + req_msg->primary_remote_gid = pri_path->dgid; + cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); + cm_req_set_primary_packet_rate(req_msg, pri_path->rate); + req_msg->primary_traffic_class = pri_path->traffic_class; + req_msg->primary_hop_limit = pri_path->hop_limit; + cm_req_set_primary_sl(req_msg, pri_path->sl); + cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1)); cm_req_set_primary_local_ack_timeout(req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, - param->primary_path->packet_life_time)); + pri_path->packet_life_time)); - if (param->alternate_path) { - req_msg->alt_local_lid = param->alternate_path->slid; - req_msg->alt_remote_lid = param->alternate_path->dlid; - req_msg->alt_local_gid = param->alternate_path->sgid; - req_msg->alt_remote_gid = param->alternate_path->dgid; + if (alt_path) { + if (alt_path->hop_limit <= 1) { + req_msg->alt_local_lid = alt_path->slid; + req_msg->alt_remote_lid = alt_path->dlid; + } else { + req_msg->alt_local_lid = IB_LID_PERMISSIVE; + req_msg->alt_remote_lid = IB_LID_PERMISSIVE; + } + req_msg->alt_local_gid = alt_path->sgid; + req_msg->alt_remote_gid = alt_path->dgid; cm_req_set_alt_flow_label(req_msg, - param->alternate_path->flow_label); - cm_req_set_alt_packet_rate(req_msg, param->alternate_path->rate); - req_msg->alt_traffic_class = param->alternate_path->traffic_class; - req_msg->alt_hop_limit = param->alternate_path->hop_limit; - cm_req_set_alt_sl(req_msg, param->alternate_path->sl); - cm_req_set_alt_subnet_local(req_msg, 1); /* local only... */ + alt_path->flow_label); + cm_req_set_alt_packet_rate(req_msg, alt_path->rate); + req_msg->alt_traffic_class = alt_path->traffic_class; + req_msg->alt_hop_limit = alt_path->hop_limit; + cm_req_set_alt_sl(req_msg, alt_path->sl); + cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1)); cm_req_set_alt_local_ack_timeout(req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, - param->alternate_path->packet_life_time)); + alt_path->packet_life_time)); } if (param->private_data && param->private_data_len) @@ -1298,6 +1415,9 @@ static void cm_dup_req_handler(struct cm_work *work, struct ib_mad_send_buf *msg = NULL; int ret; + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_REQ_COUNTER]); + /* Quick state check to discard duplicate REQs. */ if (cm_id_priv->id.state == IB_CM_REQ_RCVD) return; @@ -1387,6 +1507,34 @@ out: return listen_cm_id_priv; } +/* + * Work-around for inter-subnet connections. If the LIDs are permissive, + * we need to override the LID/SL data in the REQ with the LID information + * in the work completion. + */ +static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) +{ + if (!cm_req_get_primary_subnet_local(req_msg)) { + if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { + req_msg->primary_local_lid = cpu_to_be16(wc->slid); + cm_req_set_primary_sl(req_msg, wc->sl); + } + + if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE) + req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits); + } + + if (!cm_req_get_alt_subnet_local(req_msg)) { + if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { + req_msg->alt_local_lid = cpu_to_be16(wc->slid); + cm_req_set_alt_sl(req_msg, wc->sl); + } + + if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE) + req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits); + } +} + static int cm_req_handler(struct cm_work *work) { struct ib_cm_id *cm_id; @@ -1396,7 +1544,7 @@ static int cm_req_handler(struct cm_work *work) req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; - cm_id = ib_create_cm_id(work->port->cm_dev->device, NULL, NULL); + cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); @@ -1427,10 +1575,11 @@ static int cm_req_handler(struct cm_work *work) cm_id_priv->id.service_id = req_msg->service_id; cm_id_priv->id.service_mask = __constant_cpu_to_be64(~0ULL); + cm_process_routed_req(req_msg, work->mad_recv_wc->wc); cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); if (ret) { - ib_get_cached_gid(work->port->cm_dev->device, + ib_get_cached_gid(work->port->cm_dev->ib_device, work->port->port_num, 0, &work->path[0].sgid); ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, @@ -1651,6 +1800,8 @@ static void cm_dup_rep_handler(struct cm_work *work) if (!cm_id_priv) return; + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_REP_COUNTER]); ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); if (ret) goto deref; @@ -1816,6 +1967,8 @@ static int cm_rtu_handler(struct cm_work *work) if (cm_id_priv->id.state != IB_CM_REP_SENT && cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) { spin_unlock_irq(&cm_id_priv->lock); + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_RTU_COUNTER]); goto out; } cm_id_priv->id.state = IB_CM_ESTABLISHED; @@ -1993,6 +2146,8 @@ static int cm_dreq_handler(struct cm_work *work) cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id, dreq_msg->local_comm_id); if (!cm_id_priv) { + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); return -EINVAL; } @@ -2012,6 +2167,8 @@ static int cm_dreq_handler(struct cm_work *work) case IB_CM_MRA_REP_RCVD: break; case IB_CM_TIMEWAIT: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_DREQ_COUNTER]); if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) goto unlock; @@ -2023,6 +2180,10 @@ static int cm_dreq_handler(struct cm_work *work) if (ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; + case IB_CM_DREQ_RCVD: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_DREQ_COUNTER]); + goto unlock; default: goto unlock; } @@ -2380,10 +2541,20 @@ static int cm_mra_handler(struct cm_work *work) if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER || cm_id_priv->id.lap_state != IB_CM_LAP_SENT || ib_modify_mad(cm_id_priv->av.port->mad_agent, - cm_id_priv->msg, timeout)) + cm_id_priv->msg, timeout)) { + if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) + atomic_long_inc(&work->port-> + counter_group[CM_RECV_DUPLICATES]. + counter[CM_MRA_COUNTER]); goto out; + } cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD; break; + case IB_CM_MRA_REQ_RCVD: + case IB_CM_MRA_REP_RCVD: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_MRA_COUNTER]); + /* fall through */ default: goto out; } @@ -2543,6 +2714,8 @@ static int cm_lap_handler(struct cm_work *work) case IB_CM_LAP_IDLE: break; case IB_CM_MRA_LAP_SENT: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_LAP_COUNTER]); if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) goto unlock; @@ -2556,6 +2729,10 @@ static int cm_lap_handler(struct cm_work *work) if (ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; + case IB_CM_LAP_RCVD: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_LAP_COUNTER]); + goto unlock; default: goto unlock; } @@ -2736,7 +2913,7 @@ static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg, cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR)); sidr_req_msg->request_id = cm_id_priv->id.local_id; - sidr_req_msg->pkey = cpu_to_be16(param->path->pkey); + sidr_req_msg->pkey = param->path->pkey; sidr_req_msg->service_id = param->service_id; if (param->private_data && param->private_data_len) @@ -2821,7 +2998,7 @@ static int cm_sidr_req_handler(struct cm_work *work) struct cm_sidr_req_msg *sidr_req_msg; struct ib_wc *wc; - cm_id = ib_create_cm_id(work->port->cm_dev->device, NULL, NULL); + cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); cm_id_priv = container_of(cm_id, struct cm_id_private, id); @@ -2843,6 +3020,8 @@ static int cm_sidr_req_handler(struct cm_work *work) cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv); if (cur_cm_id_priv) { spin_unlock_irq(&cm.lock); + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_SIDR_REQ_COUNTER]); goto out; /* Duplicate message. */ } cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; @@ -3037,6 +3216,27 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_send_buf *msg = mad_send_wc->send_buf; + struct cm_port *port; + u16 attr_index; + + port = mad_agent->context; + attr_index = be16_to_cpu(((struct ib_mad_hdr *) + msg->mad)->attr_id) - CM_ATTR_ID_OFFSET; + + /* + * If the send was in response to a received message (context[0] is not + * set to a cm_id), and is not a REJ, then it is a send that was + * manually retried. + */ + if (!msg->context[0] && (attr_index != CM_REJ_COUNTER)) + msg->retries = 1; + + atomic_long_add(1 + msg->retries, + &port->counter_group[CM_XMIT].counter[attr_index]); + if (msg->retries) + atomic_long_add(msg->retries, + &port->counter_group[CM_XMIT_RETRIES]. + counter[attr_index]); switch (mad_send_wc->status) { case IB_WC_SUCCESS: @@ -3195,8 +3395,10 @@ EXPORT_SYMBOL(ib_cm_notify); static void cm_recv_handler(struct ib_mad_agent *mad_agent, struct ib_mad_recv_wc *mad_recv_wc) { + struct cm_port *port = mad_agent->context; struct cm_work *work; enum ib_cm_event_type event; + u16 attr_id; int paths = 0; switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) { @@ -3241,6 +3443,10 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent, return; } + attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id); + atomic_long_inc(&port->counter_group[CM_RECV]. + counter[attr_id - CM_ATTR_ID_OFFSET]); + work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths, GFP_KERNEL); if (!work) { @@ -3251,7 +3457,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent, INIT_DELAYED_WORK(&work->work, cm_work_handler); work->cm_event.event = event; work->mad_recv_wc = mad_recv_wc; - work->port = (struct cm_port *)mad_agent->context; + work->port = port; queue_delayed_work(cm.wq, &work->work, 0); } @@ -3420,13 +3626,93 @@ static void cm_get_ack_delay(struct cm_device *cm_dev) { struct ib_device_attr attr; - if (ib_query_device(cm_dev->device, &attr)) + if (ib_query_device(cm_dev->ib_device, &attr)) cm_dev->ack_delay = 0; /* acks will rely on packet life time */ else cm_dev->ack_delay = attr.local_ca_ack_delay; } -static void cm_add_one(struct ib_device *device) +static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, + char *buf) +{ + struct cm_counter_group *group; + struct cm_counter_attribute *cm_attr; + + group = container_of(obj, struct cm_counter_group, obj); + cm_attr = container_of(attr, struct cm_counter_attribute, attr); + + return sprintf(buf, "%ld\n", + atomic_long_read(&group->counter[cm_attr->index])); +} + +static struct sysfs_ops cm_counter_ops = { + .show = cm_show_counter +}; + +static struct kobj_type cm_counter_obj_type = { + .sysfs_ops = &cm_counter_ops, + .default_attrs = cm_counter_default_attrs +}; + +static void cm_release_port_obj(struct kobject *obj) +{ + struct cm_port *cm_port; + + cm_port = container_of(obj, struct cm_port, port_obj); + kfree(cm_port); +} + +static struct kobj_type cm_port_obj_type = { + .release = cm_release_port_obj +}; + +struct class cm_class = { + .name = "infiniband_cm", +}; +EXPORT_SYMBOL(cm_class); + +static int cm_create_port_fs(struct cm_port *port) +{ + int i, ret; + + ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type, + &port->cm_dev->device->kobj, + "%d", port->port_num); + if (ret) { + kfree(port); + return ret; + } + + for (i = 0; i < CM_COUNTER_GROUPS; i++) { + ret = kobject_init_and_add(&port->counter_group[i].obj, + &cm_counter_obj_type, + &port->port_obj, + "%s", counter_group_names[i]); + if (ret) + goto error; + } + + return 0; + +error: + while (i--) + kobject_unregister(&port->counter_group[i].obj); + kobject_unregister(&port->port_obj); + return ret; + +} + +static void cm_remove_port_fs(struct cm_port *port) +{ + int i; + + for (i = 0; i < CM_COUNTER_GROUPS; i++) + kobject_unregister(&port->counter_group[i].obj); + + kobject_unregister(&port->port_obj); +} + +static void cm_add_one(struct ib_device *ib_device) { struct cm_device *cm_dev; struct cm_port *port; @@ -3441,23 +3727,40 @@ static void cm_add_one(struct ib_device *device) int ret; u8 i; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB) return; - cm_dev = kmalloc(sizeof(*cm_dev) + sizeof(*port) * - device->phys_port_cnt, GFP_KERNEL); + cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) * + ib_device->phys_port_cnt, GFP_KERNEL); if (!cm_dev) return; - cm_dev->device = device; + cm_dev->ib_device = ib_device; cm_get_ack_delay(cm_dev); + cm_dev->device = device_create(&cm_class, ib_device->class_dev.dev, + MKDEV(0, 0), + "%s", ib_device->name); + if (!cm_dev->device) { + kfree(cm_dev); + return; + } + set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); - for (i = 1; i <= device->phys_port_cnt; i++) { - port = &cm_dev->port[i-1]; + for (i = 1; i <= ib_device->phys_port_cnt; i++) { + port = kzalloc(sizeof *port, GFP_KERNEL); + if (!port) + goto error1; + + cm_dev->port[i-1] = port; port->cm_dev = cm_dev; port->port_num = i; - port->mad_agent = ib_register_mad_agent(device, i, + + ret = cm_create_port_fs(port); + if (ret) + goto error1; + + port->mad_agent = ib_register_mad_agent(ib_device, i, IB_QPT_GSI, ®_req, 0, @@ -3465,33 +3768,37 @@ static void cm_add_one(struct ib_device *device) cm_recv_handler, port); if (IS_ERR(port->mad_agent)) - goto error1; + goto error2; - ret = ib_modify_port(device, i, 0, &port_modify); + ret = ib_modify_port(ib_device, i, 0, &port_modify); if (ret) - goto error2; + goto error3; } - ib_set_client_data(device, &cm_client, cm_dev); + ib_set_client_data(ib_device, &cm_client, cm_dev); write_lock_irqsave(&cm.device_lock, flags); list_add_tail(&cm_dev->list, &cm.device_list); write_unlock_irqrestore(&cm.device_lock, flags); return; -error2: +error3: ib_unregister_mad_agent(port->mad_agent); +error2: + cm_remove_port_fs(port); error1: port_modify.set_port_cap_mask = 0; port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; while (--i) { - port = &cm_dev->port[i-1]; - ib_modify_port(device, port->port_num, 0, &port_modify); + port = cm_dev->port[i-1]; + ib_modify_port(ib_device, port->port_num, 0, &port_modify); ib_unregister_mad_agent(port->mad_agent); + cm_remove_port_fs(port); } + device_unregister(cm_dev->device); kfree(cm_dev); } -static void cm_remove_one(struct ib_device *device) +static void cm_remove_one(struct ib_device *ib_device) { struct cm_device *cm_dev; struct cm_port *port; @@ -3501,7 +3808,7 @@ static void cm_remove_one(struct ib_device *device) unsigned long flags; int i; - cm_dev = ib_get_client_data(device, &cm_client); + cm_dev = ib_get_client_data(ib_device, &cm_client); if (!cm_dev) return; @@ -3509,12 +3816,14 @@ static void cm_remove_one(struct ib_device *device) list_del(&cm_dev->list); write_unlock_irqrestore(&cm.device_lock, flags); - for (i = 1; i <= device->phys_port_cnt; i++) { - port = &cm_dev->port[i-1]; - ib_modify_port(device, port->port_num, 0, &port_modify); + for (i = 1; i <= ib_device->phys_port_cnt; i++) { + port = cm_dev->port[i-1]; + ib_modify_port(ib_device, port->port_num, 0, &port_modify); ib_unregister_mad_agent(port->mad_agent); flush_workqueue(cm.wq); + cm_remove_port_fs(port); } + device_unregister(cm_dev->device); kfree(cm_dev); } @@ -3536,17 +3845,25 @@ static int __init ib_cm_init(void) idr_pre_get(&cm.local_id_table, GFP_KERNEL); INIT_LIST_HEAD(&cm.timewait_list); - cm.wq = create_workqueue("ib_cm"); - if (!cm.wq) + ret = class_register(&cm_class); + if (ret) return -ENOMEM; + cm.wq = create_workqueue("ib_cm"); + if (!cm.wq) { + ret = -ENOMEM; + goto error1; + } + ret = ib_register_client(&cm_client); if (ret) - goto error; + goto error2; return 0; -error: +error2: destroy_workqueue(cm.wq); +error1: + class_unregister(&cm_class); return ret; } @@ -3567,6 +3884,7 @@ static void __exit ib_cm_cleanup(void) kfree(timewait_info); } + class_unregister(&cm_class); idr_destroy(&cm.local_id_table); } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 19f75d2..faef87b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4,29 +4,33 @@ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. * - * This Software is licensed under one of the following licenses: + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: * - * 1) under the terms of the "Common Public License 1.0" a copy of which is - * available from the Open Source Initiative, see - * http://www.opensource.org/licenses/cpl.php. + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: * - * 2) under the terms of the "The BSD License" a copy of which is - * available from the Open Source Initiative, see - * http://www.opensource.org/licenses/bsd-license.php. + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. * - * 3) under the terms of the "GNU General Public License (GPL) Version 2" a - * copy of which is available from the Open Source Initiative, see - * http://www.opensource.org/licenses/gpl-license.php. - * - * Licensee has the right to choose one of the above licenses. - * - * Redistributions of source code must retain the above copyright - * notice and one of the license notices. - * - * Redistributions in binary form must reproduce both the above copyright - * notice, one of the license notices in the documentation - * and/or other materials provided with the distribution. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #include <linux/completion.h> @@ -54,6 +58,11 @@ static int tavor_quirk = 0; module_param_named(tavor_quirk, tavor_quirk, int, 0644); MODULE_PARM_DESC(tavor_quirk, "Tavor performance quirk: limit MTU to 1K if > 0"); +int unify_tcp_port_space = 0; +module_param(unify_tcp_port_space, int, 0644); +MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port " + "space allocation (default=0)"); + #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) @@ -121,6 +130,7 @@ struct rdma_id_private { struct rdma_cm_id id; struct rdma_bind_list *bind_list; + struct socket *sock; struct hlist_node node; struct list_head list; /* listen_any_list or cma_device.list */ struct list_head listen_list; /* per device listens */ @@ -134,8 +144,7 @@ struct rdma_id_private { struct completion comp; atomic_t refcount; - wait_queue_head_t wait_remove; - atomic_t dev_remove; + struct mutex handler_mutex; int backlog; int timeout_ms; @@ -160,9 +169,7 @@ struct cma_multicast { } multicast; struct list_head list; void *context; - struct sockaddr addr; - u8 pad[sizeof(struct sockaddr_in6) - - sizeof(struct sockaddr)]; + struct sockaddr_storage addr; }; struct cma_work { @@ -173,18 +180,24 @@ struct cma_work { struct rdma_cm_event event; }; +struct cma_ndev_work { + struct work_struct work; + struct rdma_id_private *id; + struct rdma_cm_event event; +}; + union cma_ip_addr { struct in6_addr ip6; struct { - __u32 pad[3]; - __u32 addr; + __be32 pad[3]; + __be32 addr; } ip4; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ - __u16 port; + __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; @@ -194,8 +207,8 @@ struct sdp_hh { u8 sdp_version; /* Major version: 7:4 */ u8 ip_version; /* IP version: 7:4 */ u8 sdp_specific1[10]; - __u16 port; - __u16 sdp_specific2; + __be16 port; + __be16 sdp_specific2; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; @@ -297,21 +310,25 @@ static void cma_detach_from_dev(struct rdma_id_private *id_priv) id_priv->cma_dev = NULL; } -static int cma_set_qkey(struct ib_device *device, u8 port_num, - enum rdma_port_space ps, - struct rdma_dev_addr *dev_addr, u32 *qkey) +static int cma_set_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; - switch (ps) { + if (id_priv->qkey) + return 0; + + switch (id_priv->id.ps) { case RDMA_PS_UDP: - *qkey = RDMA_UDP_QKEY; + id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: - ib_addr_get_mgid(dev_addr, &rec.mgid); - ret = ib_sa_get_mcmember_rec(device, port_num, &rec.mgid, &rec); - *qkey = be32_to_cpu(rec.qkey); + ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(id_priv->id.device, + id_priv->id.port_num, &rec.mgid, + &rec); + if (!ret) + id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; @@ -341,12 +358,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) ret = ib_find_cached_gid(cma_dev->device, &gid, &id_priv->id.port_num, NULL); if (!ret) { - ret = cma_set_qkey(cma_dev->device, - id_priv->id.port_num, - id_priv->id.ps, dev_addr, - &id_priv->qkey); - if (!ret) - cma_attach_to_dev(id_priv, cma_dev); + cma_attach_to_dev(id_priv, cma_dev); break; } } @@ -359,26 +371,15 @@ static void cma_deref_id(struct rdma_id_private *id_priv) complete(&id_priv->comp); } -static int cma_disable_remove(struct rdma_id_private *id_priv, +static int cma_disable_callback(struct rdma_id_private *id_priv, enum cma_state state) { - unsigned long flags; - int ret; - - spin_lock_irqsave(&id_priv->lock, flags); - if (id_priv->state == state) { - atomic_inc(&id_priv->dev_remove); - ret = 0; - } else - ret = -EINVAL; - spin_unlock_irqrestore(&id_priv->lock, flags); - return ret; -} - -static void cma_enable_remove(struct rdma_id_private *id_priv) -{ - if (atomic_dec_and_test(&id_priv->dev_remove)) - wake_up(&id_priv->wait_remove); + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != state) { + mutex_unlock(&id_priv->handler_mutex); + return -EINVAL; + } + return 0; } static int cma_has_cm_dev(struct rdma_id_private *id_priv) @@ -403,8 +404,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); atomic_set(&id_priv->refcount, 1); - init_waitqueue_head(&id_priv->wait_remove); - atomic_set(&id_priv->dev_remove, 0); + mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); @@ -496,7 +496,8 @@ void rdma_destroy_qp(struct rdma_cm_id *id) } EXPORT_SYMBOL(rdma_destroy_qp); -static int cma_modify_qp_rtr(struct rdma_id_private *id_priv) +static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; @@ -522,13 +523,16 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv) if (ret) goto out; + if (conn_param) + qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } -static int cma_modify_qp_rts(struct rdma_id_private *id_priv) +static int cma_modify_qp_rts(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; @@ -544,6 +548,8 @@ static int cma_modify_qp_rts(struct rdma_id_private *id_priv) if (ret) goto out; + if (conn_param) + qp_attr.max_rd_atomic = conn_param->initiator_depth; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); @@ -584,6 +590,10 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (cma_is_ud_ps(id_priv->id.ps)) { + ret = cma_set_qkey(id_priv); + if (ret) + return ret; + qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { @@ -632,7 +642,8 @@ static inline int cma_zero_addr(struct sockaddr *addr) struct in6_addr *ip6; if (addr->sa_family == AF_INET) - return ZERONET(((struct sockaddr_in *) addr)->sin_addr.s_addr); + return ipv4_is_zeronet( + ((struct sockaddr_in *)addr)->sin_addr.s_addr); else { ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr; return (ip6->s6_addr32[0] | ip6->s6_addr32[1] | @@ -642,7 +653,7 @@ static inline int cma_zero_addr(struct sockaddr *addr) static inline int cma_loopback_addr(struct sockaddr *addr) { - return LOOPBACK(((struct sockaddr_in *) addr)->sin_addr.s_addr); + return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr); } static inline int cma_any_addr(struct sockaddr *addr) @@ -664,7 +675,7 @@ static inline int cma_any_port(struct sockaddr *addr) } static int cma_get_net_info(void *hdr, enum rdma_port_space ps, - u8 *ip_ver, __u16 *port, + u8 *ip_ver, __be16 *port, union cma_ip_addr **src, union cma_ip_addr **dst) { switch (ps) { @@ -696,7 +707,7 @@ static int cma_get_net_info(void *hdr, enum rdma_port_space ps, static void cma_save_net_info(struct rdma_addr *addr, struct rdma_addr *listen_addr, - u8 ip_ver, __u16 port, + u8 ip_ver, __be16 port, union cma_ip_addr *src, union cma_ip_addr *dst) { struct sockaddr_in *listen4, *ip4; @@ -790,8 +801,8 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv, cma_cancel_route(id_priv); break; case CMA_LISTEN: - if (cma_any_addr(&id_priv->id.route.addr.src_addr) && - !id_priv->cma_dev) + if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr) + && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: @@ -813,6 +824,8 @@ static void cma_release_port(struct rdma_id_private *id_priv) kfree(bind_list); } mutex_unlock(&lock); + if (id_priv->sock) + sock_release(id_priv->sock); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) @@ -874,11 +887,11 @@ static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; - ret = cma_modify_qp_rtr(id_priv); + ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; - ret = cma_modify_qp_rts(id_priv); + ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; @@ -924,7 +937,10 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) struct rdma_cm_event event; int ret = 0; - if (cma_disable_remove(id_priv, CMA_CONNECT)) + if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && + cma_disable_callback(id_priv, CMA_CONNECT)) || + (ib_event->event == IB_CM_TIMEWAIT_EXIT && + cma_disable_callback(id_priv, CMA_DISCONNECT))) return 0; memset(&event, 0, sizeof event); @@ -960,6 +976,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: + event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; + break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; @@ -971,7 +989,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: - printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d", + printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } @@ -981,12 +999,12 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, CMA_DESTROYING); - cma_enable_remove(id_priv); + mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: - cma_enable_remove(id_priv); + mutex_unlock(&id_priv->handler_mutex); return ret; } @@ -997,8 +1015,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, struct rdma_cm_id *id; struct rdma_route *rt; union cma_ip_addr *src, *dst; - __u16 port; + __be16 port; u8 ip_ver; + int ret; if (cma_get_net_info(ib_event->private_data, listen_id->ps, &ip_ver, &port, &src, &dst)) @@ -1023,10 +1042,11 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; - ib_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); - ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); - rt->addr.dev_addr.dev_type = RDMA_NODE_IB_CA; + ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, + &id->route.addr.dev_addr); + if (ret) + goto destroy_id; id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = CMA_CONNECT; @@ -1044,7 +1064,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, struct rdma_id_private *id_priv; struct rdma_cm_id *id; union cma_ip_addr *src, *dst; - __u16 port; + __be16 port; u8 ip_ver; int ret; @@ -1061,7 +1081,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, cma_save_net_info(&id->route.addr, &listen_id->route.addr, ip_ver, port, src, dst); - ret = rdma_translate_ip(&id->route.addr.src_addr, + ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, &id->route.addr.dev_addr); if (ret) goto err; @@ -1096,7 +1116,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) int offset, ret; listen_id = cm_id->context; - if (cma_disable_remove(listen_id, CMA_LISTEN)) + if (cma_disable_callback(listen_id, CMA_LISTEN)) return -ECONNABORTED; memset(&event, 0, sizeof event); @@ -1117,7 +1137,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) goto out; } - atomic_inc(&conn_id->dev_remove); + mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); mutex_lock(&lock); ret = cma_acquire_dev(conn_id); mutex_unlock(&lock); @@ -1139,7 +1159,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) !cma_is_ud_ps(conn_id->id.ps)) ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); mutex_unlock(&lock); - cma_enable_remove(conn_id); + mutex_unlock(&conn_id->handler_mutex); goto out; } @@ -1148,11 +1168,11 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) release_conn_id: cma_exch(conn_id, CMA_DESTROYING); - cma_enable_remove(conn_id); + mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(&conn_id->id); out: - cma_enable_remove(listen_id); + mutex_unlock(&listen_id->handler_mutex); return ret; } @@ -1166,7 +1186,7 @@ static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, { struct cma_hdr *cma_data, *cma_mask; struct sdp_hh *sdp_data, *sdp_mask; - __u32 ip4_addr; + __be32 ip4_addr; struct in6_addr ip6_addr; memset(compare, 0, sizeof *compare); @@ -1182,12 +1202,12 @@ static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, sdp_set_ip_ver(sdp_data, 4); sdp_set_ip_ver(sdp_mask, 0xF); sdp_data->dst_addr.ip4.addr = ip4_addr; - sdp_mask->dst_addr.ip4.addr = ~0; + sdp_mask->dst_addr.ip4.addr = htonl(~0); } else { cma_set_ip_ver(cma_data, 4); cma_set_ip_ver(cma_mask, 0xF); cma_data->dst_addr.ip4.addr = ip4_addr; - cma_mask->dst_addr.ip4.addr = ~0; + cma_mask->dst_addr.ip4.addr = htonl(~0); } break; case AF_INET6: @@ -1218,7 +1238,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) struct sockaddr_in *sin; int ret = 0; - if (cma_disable_remove(id_priv, CMA_CONNECT)) + if (cma_disable_callback(id_priv, CMA_CONNECT)) return 0; memset(&event, 0, sizeof event); @@ -1262,12 +1282,12 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; cma_exch(id_priv, CMA_DESTROYING); - cma_enable_remove(id_priv); + mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } - cma_enable_remove(id_priv); + mutex_unlock(&id_priv->handler_mutex); return ret; } @@ -1283,31 +1303,31 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct ib_device_attr attr; listen_id = cm_id->context; - if (cma_disable_remove(listen_id, CMA_LISTEN)) + if (cma_disable_callback(listen_id, CMA_LISTEN)) return -ECONNABORTED; /* Create a new RDMA id for the new IW CM ID */ new_cm_id = rdma_create_id(listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP); - if (!new_cm_id) { + if (IS_ERR(new_cm_id)) { ret = -ENOMEM; goto out; } conn_id = container_of(new_cm_id, struct rdma_id_private, id); - atomic_inc(&conn_id->dev_remove); + mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = CMA_CONNECT; - dev = ip_dev_find(iw_event->local_addr.sin_addr.s_addr); + dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr); if (!dev) { ret = -EADDRNOTAVAIL; - cma_enable_remove(conn_id); + mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL); if (ret) { - cma_enable_remove(conn_id); + mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } @@ -1316,7 +1336,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, ret = cma_acquire_dev(conn_id); mutex_unlock(&lock); if (ret) { - cma_enable_remove(conn_id); + mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } @@ -1332,30 +1352,33 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, ret = ib_query_device(conn_id->id.device, &attr); if (ret) { - cma_enable_remove(conn_id); + mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } - + memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; - event.param.conn.initiator_depth = attr.max_qp_init_rd_atom; + event.param.conn.initiator_depth = attr.max_qp_init_rd_atom; event.param.conn.responder_resources = attr.max_qp_rd_atom; ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; cma_exch(conn_id, CMA_DESTROYING); - cma_enable_remove(conn_id); + mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(&conn_id->id); + goto out; } + mutex_unlock(&conn_id->handler_mutex); + out: if (dev) dev_put(dev); - cma_enable_remove(listen_id); + mutex_unlock(&listen_id->handler_mutex); return ret; } @@ -1371,7 +1394,7 @@ static int cma_ib_listen(struct rdma_id_private *id_priv) if (IS_ERR(id_priv->cm_id.ib)) return PTR_ERR(id_priv->cm_id.ib); - addr = &id_priv->id.route.addr.src_addr; + addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; svc_id = cma_get_service_id(id_priv->id.ps, addr); if (cma_any_addr(addr)) ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); @@ -1437,7 +1460,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, dev_id_priv->state = CMA_ADDR_BOUND; memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, - ip_addr_size(&id_priv->id.route.addr.src_addr)); + ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); @@ -1447,7 +1470,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, ret = rdma_listen(id, id_priv->backlog); if (ret) printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, " - "listening on device %s", ret, cma_dev->device->name); + "listening on device %s\n", ret, cma_dev->device->name); } static void cma_listen_on_all(struct rdma_id_private *id_priv) @@ -1557,13 +1580,14 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; - path_rec.service_id = cma_get_service_id(id_priv->id.ps, &addr->dst_addr); + path_rec.service_id = cma_get_service_id(id_priv->id.ps, + (struct sockaddr *) &addr->dst_addr); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; - if (addr->src_addr.sa_family == AF_INET) { + if (addr->src_addr.ss_family == AF_INET) { path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; } else { @@ -1592,7 +1616,7 @@ static void cma_work_handler(struct work_struct *_work) struct rdma_id_private *id_priv = work->id; int destroy = 0; - atomic_inc(&id_priv->dev_remove); + mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out; @@ -1601,7 +1625,31 @@ static void cma_work_handler(struct work_struct *_work) destroy = 1; } out: - cma_enable_remove(id_priv); + mutex_unlock(&id_priv->handler_mutex); + cma_deref_id(id_priv); + if (destroy) + rdma_destroy_id(&id_priv->id); + kfree(work); +} + +static void cma_ndev_work_handler(struct work_struct *_work) +{ + struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); + struct rdma_id_private *id_priv = work->id; + int destroy = 0; + + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state == CMA_DESTROYING || + id_priv->state == CMA_DEVICE_REMOVAL) + goto out; + + if (id_priv->id.event_handler(&id_priv->id, &work->event)) { + cma_exch(id_priv, CMA_DESTROYING); + destroy = 1; + } + +out: + mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); if (destroy) rdma_destroy_id(&id_priv->id); @@ -1764,7 +1812,7 @@ static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_cm_event event; memset(&event, 0, sizeof event); - atomic_inc(&id_priv->dev_remove); + mutex_lock(&id_priv->handler_mutex); /* * Grab mutex to block rdma_destroy_id() from removing the device while @@ -1793,13 +1841,13 @@ static void addr_handler(int status, struct sockaddr *src_addr, if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, CMA_DESTROYING); - cma_enable_remove(id_priv); + mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); return; } out: - cma_enable_remove(id_priv); + mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); } @@ -1823,7 +1871,7 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - if (cma_zero_addr(&id_priv->id.route.addr.src_addr)) { + if (cma_zero_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)) { src_in = (struct sockaddr_in *)&id_priv->id.route.addr.src_addr; dst_in = (struct sockaddr_in *)&id_priv->id.route.addr.dst_addr; src_in->sin_family = dst_in->sin_family; @@ -1872,7 +1920,7 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, if (cma_any_addr(dst_addr)) ret = cma_resolve_loopback(id_priv); else - ret = rdma_resolve_ip(&addr_client, &id->route.addr.src_addr, + ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr, dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, id_priv); if (ret) @@ -1996,11 +2044,11 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) * We don't support binding to any address if anyone is bound to * a specific address on the same port. */ - if (cma_any_addr(&id_priv->id.route.addr.src_addr)) + if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)) return -EADDRNOTAVAIL; hlist_for_each_entry(cur_id, node, &bind_list->owners, node) { - if (cma_any_addr(&cur_id->id.route.addr.src_addr)) + if (cma_any_addr((struct sockaddr *) &cur_id->id.route.addr.src_addr)) return -EADDRNOTAVAIL; cur_sin = (struct sockaddr_in *) &cur_id->id.route.addr.src_addr; @@ -2012,6 +2060,34 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) return 0; } +static int cma_get_tcp_port(struct rdma_id_private *id_priv) +{ + int ret; + int size; + struct socket *sock; + + ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret) + return ret; + ret = sock->ops->bind(sock, + (struct sockaddr *) &id_priv->id.route.addr.src_addr, + ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); + if (ret) { + sock_release(sock); + return ret; + } + size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); + ret = sock->ops->getname(sock, + (struct sockaddr *) &id_priv->id.route.addr.src_addr, + &size, 0); + if (ret) { + sock_release(sock); + return ret; + } + id_priv->sock = sock; + return 0; +} + static int cma_get_port(struct rdma_id_private *id_priv) { struct idr *ps; @@ -2023,6 +2099,11 @@ static int cma_get_port(struct rdma_id_private *id_priv) break; case RDMA_PS_TCP: ps = &tcp_ps; + if (unify_tcp_port_space) { + ret = cma_get_tcp_port(id_priv); + if (ret) + goto out; + } break; case RDMA_PS_UDP: ps = &udp_ps; @@ -2035,12 +2116,12 @@ static int cma_get_port(struct rdma_id_private *id_priv) } mutex_lock(&lock); - if (cma_any_port(&id_priv->id.route.addr.src_addr)) + if (cma_any_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); - +out: return ret; } @@ -2126,7 +2207,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; - if (cma_disable_remove(id_priv, CMA_CONNECT)) + if (cma_disable_callback(id_priv, CMA_CONNECT)) return 0; memset(&event, 0, sizeof event); @@ -2143,6 +2224,12 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, event.status = ib_event->param.sidr_rep_rcvd.status; break; } + ret = cma_set_qkey(id_priv); + if (ret) { + event.event = RDMA_CM_EVENT_ADDR_ERROR; + event.status = -EINVAL; + break; + } if (id_priv->qkey != rep->qkey) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -EINVAL; @@ -2157,7 +2244,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, event.status = 0; break; default: - printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d", + printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } @@ -2167,12 +2254,12 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, CMA_DESTROYING); - cma_enable_remove(id_priv); + mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: - cma_enable_remove(id_priv); + mutex_unlock(&id_priv->handler_mutex); return ret; } @@ -2207,8 +2294,8 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, req.path = route->path_rec; req.service_id = cma_get_service_id(id_priv->id.ps, - &route->addr.dst_addr); - req.timeout_ms = 1 << (cma_response_timeout - 8); + (struct sockaddr *) &route->addr.dst_addr); + req.timeout_ms = 1 << (cma_response_timeout - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); @@ -2258,7 +2345,7 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, req.alternate_path = &route->path_rec[1]; req.service_id = cma_get_service_id(id_priv->id.ps, - &route->addr.dst_addr); + (struct sockaddr *) &route->addr.dst_addr); req.qp_num = id_priv->qp_num; req.qp_type = IB_QPT_RC; req.starting_psn = id_priv->seq_num; @@ -2305,7 +2392,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr; cm_id->remote_addr = *sin; - ret = cma_modify_qp_rtr(id_priv); + ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; @@ -2368,25 +2455,15 @@ static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_rep_param rep; - struct ib_qp_attr qp_attr; - int qp_attr_mask, ret; - - if (id_priv->id.qp) { - ret = cma_modify_qp_rtr(id_priv); - if (ret) - goto out; + int ret; - qp_attr.qp_state = IB_QPS_RTS; - ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, &qp_attr, - &qp_attr_mask); - if (ret) - goto out; + ret = cma_modify_qp_rtr(id_priv, conn_param); + if (ret) + goto out; - qp_attr.max_rd_atomic = conn_param->initiator_depth; - ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); - if (ret) - goto out; - } + ret = cma_modify_qp_rts(id_priv, conn_param); + if (ret) + goto out; memset(&rep, 0, sizeof rep); rep.qp_num = id_priv->qp_num; @@ -2411,7 +2488,7 @@ static int cma_accept_iw(struct rdma_id_private *id_priv, struct iw_cm_conn_param iw_param; int ret; - ret = cma_modify_qp_rtr(id_priv); + ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; @@ -2432,10 +2509,14 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; + int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { + ret = cma_set_qkey(id_priv); + if (ret) + return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; } @@ -2580,8 +2661,8 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) int ret; id_priv = mc->id_priv; - if (cma_disable_remove(id_priv, CMA_ADDR_BOUND) && - cma_disable_remove(id_priv, CMA_ADDR_RESOLVED)) + if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) && + cma_disable_callback(id_priv, CMA_ADDR_RESOLVED)) return 0; mutex_lock(&id_priv->qp_mutex); @@ -2606,12 +2687,12 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { cma_exch(id_priv, CMA_DESTROYING); - cma_enable_remove(id_priv); + mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return 0; } - cma_enable_remove(id_priv); + mutex_unlock(&id_priv->handler_mutex); return 0; } @@ -2631,11 +2712,9 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { - ip_ib_mc_map(sin->sin_addr.s_addr, mc_map); + ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ - mc_map[8] = ib_addr_get_pkey(dev_addr) >> 8; - mc_map[9] = (unsigned char) ib_addr_get_pkey(dev_addr); *mgid = *(union ib_gid *) (mc_map + 4); } } @@ -2654,7 +2733,7 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, if (ret) return ret; - cma_set_mgid(id_priv, &mc->addr, &rec.mgid); + cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); if (id_priv->id.ps == RDMA_PS_UDP) rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); ib_addr_get_sgid(dev_addr, &rec.port_gid); @@ -2667,6 +2746,10 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; + if (id_priv->id.ps == RDMA_PS_IPOIB) + comp_mask |= IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_RATE_SELECTOR; + mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, @@ -2745,6 +2828,62 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) } EXPORT_SYMBOL(rdma_leave_multicast); +static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) +{ + struct rdma_dev_addr *dev_addr; + struct cma_ndev_work *work; + + dev_addr = &id_priv->id.route.addr.dev_addr; + + if ((dev_addr->src_dev == ndev) && + memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { + printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", + ndev->name, &id_priv->id); + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + INIT_WORK(&work->work, cma_ndev_work_handler); + work->id = id_priv; + work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; + atomic_inc(&id_priv->refcount); + queue_work(cma_wq, &work->work); + } + + return 0; +} + +static int cma_netdev_callback(struct notifier_block *self, unsigned long event, + void *ctx) +{ + struct net_device *ndev = (struct net_device *)ctx; + struct cma_device *cma_dev; + struct rdma_id_private *id_priv; + int ret = NOTIFY_DONE; + + if (event != NETDEV_BONDING_FAILOVER) + return NOTIFY_DONE; + + if (!(ndev->flags & IFF_MASTER)) + return NOTIFY_DONE; + + mutex_lock(&lock); + list_for_each_entry(cma_dev, &dev_list, list) + list_for_each_entry(id_priv, &cma_dev->id_list, list) { + ret = cma_netdev_change(ndev, id_priv); + if (ret) + goto out; + } + +out: + mutex_unlock(&lock); + return ret; +} + +static struct notifier_block cma_nb = { + .notifier_call = cma_netdev_callback +}; + static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; @@ -2772,6 +2911,7 @@ static int cma_remove_id_dev(struct rdma_id_private *id_priv) { struct rdma_cm_event event; enum cma_state state; + int ret = 0; /* Record that we want to remove the device */ state = cma_exch(id_priv, CMA_DEVICE_REMOVAL); @@ -2779,15 +2919,18 @@ static int cma_remove_id_dev(struct rdma_id_private *id_priv) return 0; cma_cancel_operation(id_priv, state); - wait_event(id_priv->wait_remove, !atomic_read(&id_priv->dev_remove)); + mutex_lock(&id_priv->handler_mutex); /* Check for destruction from another callback. */ if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL)) - return 0; + goto out; memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; - return id_priv->id.event_handler(&id_priv->id, &event); + ret = id_priv->id.event_handler(&id_priv->id, &event); +out: + mutex_unlock(&id_priv->handler_mutex); + return ret; } static void cma_process_remove(struct cma_device *cma_dev) @@ -2849,6 +2992,7 @@ static int cma_init(void) ib_sa_register_client(&sa_client); rdma_addr_register_client(&addr_client); + register_netdevice_notifier(&cma_nb); ret = ib_register_client(&cma_client); if (ret) @@ -2856,6 +3000,7 @@ static int cma_init(void) return 0; err: + unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(cma_wq); @@ -2865,6 +3010,7 @@ err: static void cma_cleanup(void) { ib_unregister_client(&cma_client); + unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(cma_wq); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 7ad47a4..05ac36e 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -28,8 +28,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: core_priv.h 1349 2004-12-16 21:09:43Z roland $ */ #ifndef _CORE_PRIV_H diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 858b6ea..179f753 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -29,8 +29,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: device.c 1349 2004-12-16 21:09:43Z roland $ */ #include <linux/module.h> @@ -180,9 +178,14 @@ static int end_port(struct ib_device *device) */ struct ib_device *ib_alloc_device(size_t size) { + struct ib_device *ibdev; + BUG_ON(size < sizeof (struct ib_device)); - return kzalloc(size, GFP_KERNEL); + ibdev = kzalloc(size, GFP_KERNEL); + if (ibdev) + mutex_init(&ibdev->sysfs_mutex); + return ibdev; } EXPORT_SYMBOL(ib_alloc_device); @@ -313,9 +316,10 @@ int ib_register_device(struct ib_device *device) goto out; } + mutex_lock(&device->sysfs_mutex); list_add_tail(&device->core_list, &device_list); - device->reg_state = IB_DEV_REGISTERED; + mutex_unlock(&device->sysfs_mutex); { struct ib_client *client; @@ -361,7 +365,9 @@ void ib_unregister_device(struct ib_device *device) kfree(context); spin_unlock_irqrestore(&device->client_data_lock, flags); + mutex_lock(&device->sysfs_mutex); device->reg_state = IB_DEV_UNREGISTERED; + mutex_unlock(&device->sysfs_mutex); } EXPORT_SYMBOL(ib_unregister_device); @@ -727,6 +733,10 @@ static int __init ib_core_init(void) dma_map_sg_hp_wa = 1; #endif + ret = init_mnt_writers(); + if (ret) + printk(KERN_WARNING "Couldn't init mnt_writers\n"); + ret = ib_sysfs_setup(); if (ret) printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index bbfc713..4507043 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -29,8 +29,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: fmr_pool.c 2730 2005-06-28 16:43:03Z sean.hefty $ */ #include <linux/errno.h> @@ -158,8 +156,7 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool) #endif } - list_splice(&pool->dirty_list, &unmap_list); - INIT_LIST_HEAD(&pool->dirty_list); + list_splice_init(&pool->dirty_list, &unmap_list); pool->dirty_len = 0; spin_unlock_irq(&pool->pool_lock); @@ -182,8 +179,7 @@ static int ib_fmr_cleanup_thread(void *pool_ptr) struct ib_fmr_pool *pool = pool_ptr; do { - if (pool->dirty_len >= pool->dirty_watermark || - atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) { + if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) { ib_fmr_batch_release(pool); atomic_inc(&pool->flush_ser); @@ -194,8 +190,7 @@ static int ib_fmr_cleanup_thread(void *pool_ptr) } set_current_state(TASK_INTERRUPTIBLE); - if (pool->dirty_len < pool->dirty_watermark && - atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 && + if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 && !kthread_should_stop()) schedule(); __set_current_state(TASK_RUNNING); @@ -308,10 +303,13 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, .max_maps = pool->max_remaps, .page_shift = params->page_shift }; + int bytes_per_fmr = sizeof *fmr; + + if (pool->cache_bucket) + bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64); for (i = 0; i < params->pool_size; ++i) { - fmr = kmalloc(sizeof *fmr + params->max_pages_per_fmr * sizeof (u64), - GFP_KERNEL); + fmr = kmalloc(bytes_per_fmr, GFP_KERNEL); if (!fmr) { printk(KERN_WARNING PFX "failed to allocate fmr " "struct for FMR %d\n", i); @@ -526,8 +524,10 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) list_add_tail(&fmr->list, &pool->free_list); } else { list_add_tail(&fmr->list, &pool->dirty_list); - ++pool->dirty_len; - wake_up_process(pool->thread); + if (++pool->dirty_len >= pool->dirty_watermark) { + atomic_inc(&pool->req_ser); + wake_up_process(pool->thread); + } } } diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 81c9195..8f9509e 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -942,8 +942,7 @@ static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv, case IW_CM_STATE_CONN_RECV: case IW_CM_STATE_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; - qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE| + qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE| IB_ACCESS_REMOTE_READ; ret = 0; break; diff --git a/drivers/infiniband/core/local_sa.c b/drivers/infiniband/core/local_sa.c index 49ca262..eb62c42 100644 --- a/drivers/infiniband/core/local_sa.c +++ b/drivers/infiniband/core/local_sa.c @@ -978,7 +978,7 @@ static struct ib_sa_path_rec *get_next_path(struct ib_sa_attr_iter *iter, static void report_path(struct work_struct *work) { struct sa_path_request *req; - + req = container_of(work, struct sa_path_request, work); req->callback(0, &req->path_rec, req->context); ib_sa_client_put(req->client); @@ -1058,7 +1058,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, req->context = context; ib_sa_client_get(client); - queue_work(sa_wq, &req->work); + queue_work(sa_wq, &req->work); *sa_query = ERR_PTR(-EEXIST); return 0; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 4402d03..7934a4c 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -301,6 +301,16 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, mad_agent_priv->agent.context = context; mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp; mad_agent_priv->agent.port_num = port_num; + spin_lock_init(&mad_agent_priv->lock); + INIT_LIST_HEAD(&mad_agent_priv->send_list); + INIT_LIST_HEAD(&mad_agent_priv->wait_list); + INIT_LIST_HEAD(&mad_agent_priv->done_list); + INIT_LIST_HEAD(&mad_agent_priv->rmpp_list); + INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); + INIT_LIST_HEAD(&mad_agent_priv->local_list); + INIT_WORK(&mad_agent_priv->local_work, local_completions); + atomic_set(&mad_agent_priv->refcount, 1); + init_completion(&mad_agent_priv->comp); spin_lock_irqsave(&port_priv->reg_lock, flags); mad_agent_priv->agent.hi_tid = ++ib_mad_client_id; @@ -350,17 +360,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list); spin_unlock_irqrestore(&port_priv->reg_lock, flags); - spin_lock_init(&mad_agent_priv->lock); - INIT_LIST_HEAD(&mad_agent_priv->send_list); - INIT_LIST_HEAD(&mad_agent_priv->wait_list); - INIT_LIST_HEAD(&mad_agent_priv->done_list); - INIT_LIST_HEAD(&mad_agent_priv->rmpp_list); - INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); - INIT_LIST_HEAD(&mad_agent_priv->local_list); - INIT_WORK(&mad_agent_priv->local_work, local_completions); - atomic_set(&mad_agent_priv->refcount, 1); - init_completion(&mad_agent_priv->comp); - return &mad_agent_priv->agent; error4: @@ -747,9 +746,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, break; case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED: kmem_cache_free(ib_mad_cache, mad_priv); - kfree(local); - ret = 1; - goto out; + break; case IB_MAD_RESULT_SUCCESS: /* Treat like an incoming receive MAD */ port_priv = ib_get_mad_port(mad_agent_priv->agent.device, @@ -760,10 +757,12 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, &mad_priv->mad.mad); } if (!port_priv || !recv_mad_agent) { + /* + * No receiving agent so drop packet and + * generate send completion. + */ kmem_cache_free(ib_mad_cache, mad_priv); - kfree(local); - ret = 0; - goto out; + break; } local->mad_priv = mad_priv; local->recv_mad_agent = recv_mad_agent; @@ -1102,7 +1101,9 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid; /* Timeout will be updated after send completes */ mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms); - mad_send_wr->retries = send_buf->retries; + mad_send_wr->max_retries = send_buf->retries; + mad_send_wr->retries_left = send_buf->retries; + send_buf->retries = 0; /* Reference for work request to QP + response */ mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); mad_send_wr->status = IB_WC_SUCCESS; @@ -1695,9 +1696,8 @@ static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv, u8 port_num = mad_agent_priv->agent.port_num; u8 lmc; - send_resp = ((struct ib_mad *)(wr->send_buf.mad))-> - mad_hdr.method & IB_MGMT_METHOD_RESP; - rcv_resp = rwc->recv_buf.mad->mad_hdr.method & IB_MGMT_METHOD_RESP; + send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad); + rcv_resp = ib_response_mad(rwc->recv_buf.mad); if (send_resp == rcv_resp) /* both requests, or both responses. GIDs different */ @@ -1933,15 +1933,6 @@ local: if (port_priv->device->process_mad) { int ret; - if (!response) { - printk(KERN_ERR PFX "No memory for response MAD\n"); - /* - * Is it better to assume that - * it wouldn't be processed ? - */ - goto out; - } - ret = port_priv->device->process_mad(port_priv->device, 0, port_priv->port_num, wc, &recv->grh, @@ -2368,7 +2359,7 @@ static void local_completions(struct work_struct *work) struct ib_mad_local_private *local; struct ib_mad_agent_private *recv_mad_agent; unsigned long flags; - int recv = 0; + int free_mad; struct ib_wc wc; struct ib_mad_send_wc mad_send_wc; @@ -2382,14 +2373,15 @@ static void local_completions(struct work_struct *work) completion_list); list_del(&local->completion_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + free_mad = 0; if (local->mad_priv) { recv_mad_agent = local->recv_mad_agent; if (!recv_mad_agent) { printk(KERN_ERR PFX "No receive MAD agent for local completion\n"); + free_mad = 1; goto local_send_completion; } - recv = 1; /* * Defined behavior is to complete response * before request @@ -2434,7 +2426,7 @@ local_send_completion: spin_lock_irqsave(&mad_agent_priv->lock, flags); atomic_dec(&mad_agent_priv->refcount); - if (!recv) + if (free_mad) kmem_cache_free(ib_mad_cache, local->mad_priv); kfree(local); } @@ -2445,9 +2437,12 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) { int ret; - if (!mad_send_wr->retries--) + if (!mad_send_wr->retries_left) return -ETIMEDOUT; + mad_send_wr->retries_left--; + mad_send_wr->send_buf.retries++; + mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); if (mad_send_wr->mad_agent_priv->agent.rmpp_version) { diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 9be5cc0..05ce331 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -30,8 +30,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: mad_priv.h 5596 2006-03-03 01:00:07Z sean.hefty $ */ #ifndef __IB_MAD_PRIV_H__ @@ -131,7 +129,8 @@ struct ib_mad_send_wr_private { struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG]; __be64 tid; unsigned long timeout; - int retries; + int max_retries; + int retries_left; int retry; int refcount; enum ib_wc_status status; diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index d43bc62..3af2b84 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -29,8 +29,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: mad_rmpp.c 1921 2005-03-02 22:58:44Z sean.hefty $ */ #include "mad_priv.h" @@ -135,7 +133,7 @@ static void ack_recv(struct mad_rmpp_recv *rmpp_recv, msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp, recv_wc->wc->pkey_index, 1, hdr_len, 0, GFP_KERNEL); - if (!msg) + if (IS_ERR(msg)) return; format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv); @@ -684,7 +682,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, if (seg_num > mad_send_wr->last_ack) { adjust_last_ack(mad_send_wr, seg_num); - mad_send_wr->retries = mad_send_wr->send_buf.retries; + mad_send_wr->retries_left = mad_send_wr->max_retries; } mad_send_wr->newwin = newwin; if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { diff --git a/drivers/infiniband/core/mad_rmpp.h b/drivers/infiniband/core/mad_rmpp.h index f0616fd..3d336bf 100644 --- a/drivers/infiniband/core/mad_rmpp.h +++ b/drivers/infiniband/core/mad_rmpp.h @@ -28,8 +28,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: mad_rmpp.h 1921 2005-02-25 22:58:44Z sean.hefty $ */ #ifndef __MAD_RMPP_H__ diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index dda056c..ca95426 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -73,11 +73,20 @@ struct mcast_device { }; enum mcast_state { - MCAST_IDLE, MCAST_JOINING, MCAST_MEMBER, + MCAST_ERROR, +}; + +enum mcast_group_state { + MCAST_IDLE, MCAST_BUSY, - MCAST_ERROR + MCAST_GROUP_ERROR, + MCAST_PKEY_EVENT +}; + +enum { + MCAST_INVALID_PKEY_INDEX = 0xFFFF }; struct mcast_member; @@ -93,9 +102,12 @@ struct mcast_group { struct mcast_member *last_join; int members[3]; atomic_t refcount; - enum mcast_state state; + enum mcast_group_state state; struct ib_sa_query *query; int query_id; + u16 pkey_index; + u8 leave_state; + int retries; }; struct mcast_member { @@ -312,6 +324,7 @@ static int send_leave(struct mcast_group *group, u8 leave_state) rec = group->rec; rec.join_state = leave_state; + group->leave_state = leave_state; ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, port->port_num, IB_SA_METHOD_DELETE, &rec, @@ -350,9 +363,19 @@ static int fail_join(struct mcast_group *group, struct mcast_member *member, static void process_group_error(struct mcast_group *group) { struct mcast_member *member; - int ret; + int ret = 0; + u16 pkey_index; + + if (group->state == MCAST_PKEY_EVENT) + ret = ib_find_pkey(group->port->dev->device, + group->port->port_num, + be16_to_cpu(group->rec.pkey), &pkey_index); spin_lock_irq(&group->lock); + if (group->state == MCAST_PKEY_EVENT && !ret && + group->pkey_index == pkey_index) + goto out; + while (!list_empty(&group->active_list)) { member = list_entry(group->active_list.next, struct mcast_member, list); @@ -371,6 +394,7 @@ static void process_group_error(struct mcast_group *group) } group->rec.join_state = 0; +out: group->state = MCAST_BUSY; spin_unlock_irq(&group->lock); } @@ -387,9 +411,9 @@ static void mcast_work_handler(struct work_struct *work) retest: spin_lock_irq(&group->lock); while (!list_empty(&group->pending_list) || - (group->state == MCAST_ERROR)) { + (group->state != MCAST_BUSY)) { - if (group->state == MCAST_ERROR) { + if (group->state != MCAST_BUSY) { spin_unlock_irq(&group->lock); process_group_error(group); goto retest; @@ -466,12 +490,19 @@ static void join_handler(int status, struct ib_sa_mcmember_rec *rec, void *context) { struct mcast_group *group = context; + u16 pkey_index = MCAST_INVALID_PKEY_INDEX; if (status) process_join_error(group, status); else { + ib_find_pkey(group->port->dev->device, group->port->port_num, + be16_to_cpu(rec->pkey), &pkey_index); + spin_lock_irq(&group->port->lock); group->rec = *rec; + if (group->state == MCAST_BUSY && + group->pkey_index == MCAST_INVALID_PKEY_INDEX) + group->pkey_index = pkey_index; if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) { rb_erase(&group->node, &group->port->table); mcast_insert(group->port, group, 1); @@ -486,7 +517,11 @@ static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, { struct mcast_group *group = context; - mcast_work_handler(&group->work); + if (status && (group->retries > 0) && + !send_leave(group, group->leave_state)) + group->retries--; + else + mcast_work_handler(&group->work); } static struct mcast_group *acquire_group(struct mcast_port *port, @@ -509,8 +544,10 @@ static struct mcast_group *acquire_group(struct mcast_port *port, if (!group) return NULL; + group->retries = 3; group->port = port; group->rec.mgid = *mgid; + group->pkey_index = MCAST_INVALID_PKEY_INDEX; INIT_LIST_HEAD(&group->pending_list); INIT_LIST_HEAD(&group->active_list); INIT_WORK(&group->work, mcast_work_handler); @@ -679,7 +716,8 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, } EXPORT_SYMBOL(ib_init_ah_from_mcmember); -static void mcast_groups_lost(struct mcast_port *port) +static void mcast_groups_event(struct mcast_port *port, + enum mcast_group_state state) { struct mcast_group *group; struct rb_node *node; @@ -693,7 +731,8 @@ static void mcast_groups_lost(struct mcast_port *port) atomic_inc(&group->refcount); queue_work(mcast_wq, &group->work); } - group->state = MCAST_ERROR; + if (group->state != MCAST_GROUP_ERROR) + group->state = state; spin_unlock(&group->lock); } spin_unlock_irqrestore(&port->lock, flags); @@ -703,16 +742,20 @@ static void mcast_event_handler(struct ib_event_handler *handler, struct ib_event *event) { struct mcast_device *dev; + int index; dev = container_of(handler, struct mcast_device, event_handler); + index = event->element.port_num - dev->start_port; switch (event->event) { case IB_EVENT_PORT_ERR: case IB_EVENT_LID_CHANGE: case IB_EVENT_SM_CHANGE: case IB_EVENT_CLIENT_REREGISTER: - mcast_groups_lost(&dev->port[event->element.port_num - - dev->start_port]); + mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR); + break; + case IB_EVENT_PKEY_CHANGE: + mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT); break; default: break; diff --git a/drivers/infiniband/core/namespace.c b/drivers/infiniband/core/namespace.c new file mode 100644 index 0000000..a33f9e9 --- /dev/null +++ b/drivers/infiniband/core/namespace.c @@ -0,0 +1,95 @@ +#include <linux/spinlock_types.h> +#include <linux/percpu.h> +#include <linux/mount.h> +#include <linux/module.h> + +struct mnt_writer { + /* + * If holding multiple instances of this lock, they + * must be ordered by cpu number. + */ + spinlock_t lock; + struct lock_class_key lock_class; /* compiles out with !lockdep */ + unsigned long count; + struct vfsmount *mnt; +} ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU(struct mnt_writer, mnt_writers); + +int __init init_mnt_writers(void) +{ + int cpu; + for_each_possible_cpu(cpu) { + struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); + spin_lock_init(&writer->lock); + lockdep_set_class(&writer->lock, &writer->lock_class); + writer->count = 0; + } + return 0; +} + +static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) +{ + if (!cpu_writer->mnt) + return; + /* + * This is in case anyone ever leaves an invalid, + * old ->mnt and a count of 0. + */ + if (!cpu_writer->count) + return; + cpu_writer->count = 0; +} + +static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, + struct vfsmount *mnt) +{ + if (cpu_writer->mnt == mnt) + return; + __clear_mnt_count(cpu_writer); + cpu_writer->mnt = mnt; +} + +int mnt_want_write(struct vfsmount *mnt) +{ + int ret = 0; + struct mnt_writer *cpu_writer; + + cpu_writer = &get_cpu_var(mnt_writers); + spin_lock(&cpu_writer->lock); + if (__mnt_is_readonly(mnt)) { + ret = -EROFS; + goto out; + } + use_cpu_writer_for_mount(cpu_writer, mnt); + cpu_writer->count++; +out: + spin_unlock(&cpu_writer->lock); + put_cpu_var(mnt_writers); + return ret; +} +EXPORT_SYMBOL(mnt_want_write); + +void mnt_drop_write(struct vfsmount *mnt) +{ + struct mnt_writer *cpu_writer; + + cpu_writer = &get_cpu_var(mnt_writers); + spin_lock(&cpu_writer->lock); + + use_cpu_writer_for_mount(cpu_writer, mnt); + if (cpu_writer->count > 0) { + cpu_writer->count--; + } + + spin_unlock(&cpu_writer->lock); + /* + * This could be done right after the spinlock + * is taken because the spinlock keeps us on + * the cpu, and disables preemption. However, + * putting it here bounds the amount that + * __mnt_writers can underflow. Without it, + * we could theoretically wrap __mnt_writers. + */ + put_cpu_var(mnt_writers); +} +EXPORT_SYMBOL(mnt_drop_write); diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c index c972d72..019bd4b 100644 --- a/drivers/infiniband/core/packer.c +++ b/drivers/infiniband/core/packer.c @@ -29,8 +29,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: packer.c 1349 2004-12-16 21:09:43Z roland $ */ #include <linux/string.h> diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index a2b1c4a..59ebdd7 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -30,8 +30,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: sa_query.c 2811 2005-07-06 18:11:43Z halr $ */ #include <linux/module.h> @@ -561,6 +559,8 @@ static void update_sm_ah(struct work_struct *work) } spin_lock_irq(&port->ah_lock); + if (port->sm_ah) + kref_put(&port->sm_ah->ref, free_sm_ah); port->sm_ah = new_ah; spin_unlock_irq(&port->ah_lock); @@ -1188,6 +1188,7 @@ static void ib_sa_notice_resp(struct ib_sa_port *port, struct ib_mad_send_buf *mad_buf; struct ib_sa_mad *mad; int ret; + unsigned long flags; mad_buf = ib_create_send_mad(port->notice_agent, 1, 0, 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, @@ -1199,11 +1200,16 @@ static void ib_sa_notice_resp(struct ib_sa_port *port, memcpy(mad, mad_recv_wc->recv_buf.mad, sizeof *mad); mad->mad_hdr.method = IB_MGMT_METHOD_REPORT_RESP; - spin_lock_irq(&port->ah_lock); + spin_lock_irqsave(&port->ah_lock, flags); + if (!port->sm_ah) { + spin_unlock_irqrestore(&port->ah_lock, flags); + ib_free_send_mad(mad_buf); + return; + } kref_get(&port->sm_ah->ref); mad_buf->context[0] = &port->sm_ah->ref; mad_buf->ah = port->sm_ah->ah; - spin_unlock_irq(&port->ah_lock); + spin_unlock_irqrestore(&port->ah_lock, flags); ret = ib_post_send_mad(mad_buf, NULL); if (ret) @@ -1392,7 +1398,8 @@ static void ib_sa_remove_one(struct ib_device *device) for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { ib_unregister_mad_agent(sa_dev->port[i].notice_agent); ib_unregister_mad_agent(sa_dev->port[i].agent); - kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); + if (sa_dev->port[i].sm_ah) + kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); } kfree(sa_dev); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 70b77ae..1091946 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -30,8 +30,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: sysfs.c 1349 2004-12-16 21:09:43Z roland $ */ #include "core_priv.h" @@ -96,7 +94,7 @@ static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, char *buf) { struct ib_port_attr attr; - ssize_t ret; + ssize_t ret = -ENODEV; static const char *state_name[] = { [IB_PORT_NOP] = "NOP", @@ -107,26 +105,33 @@ static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" }; - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "%d: %s\n", attr.state, - attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ? - state_name[attr.state] : "UNKNOWN"); + mutex_lock(&p->ibdev->sysfs_mutex); + if (ibdev_is_alive(p->ibdev)) { + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (!ret) + ret = sprintf(buf, "%d: %s\n", attr.state, + attr.state >= 0 && + attr.state < ARRAY_SIZE(state_name) ? + state_name[attr.state] : "UNKNOWN"); + } + mutex_unlock(&p->ibdev->sysfs_mutex); + return ret; } static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused, char *buf) { struct ib_port_attr attr; - ssize_t ret; + ssize_t ret = -ENODEV; - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "0x%x\n", attr.lid); + mutex_lock(&p->ibdev->sysfs_mutex); + if (ibdev_is_alive(p->ibdev)) { + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (!ret) + ret = sprintf(buf, "0x%x\n", attr.lid); + } + mutex_unlock(&p->ibdev->sysfs_mutex); + return ret; } static ssize_t lid_mask_count_show(struct ib_port *p, @@ -134,52 +139,64 @@ static ssize_t lid_mask_count_show(struct ib_port *p, char *buf) { struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; + ssize_t ret = -ENODEV; - return sprintf(buf, "%d\n", attr.lmc); + mutex_lock(&p->ibdev->sysfs_mutex); + if (ibdev_is_alive(p->ibdev)) { + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (!ret) + ret = sprintf(buf, "%d\n", attr.lmc); + } + mutex_unlock(&p->ibdev->sysfs_mutex); + return ret; } static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused, char *buf) { struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; + ssize_t ret = -ENODEV; - return sprintf(buf, "0x%x\n", attr.sm_lid); + mutex_lock(&p->ibdev->sysfs_mutex); + if (ibdev_is_alive(p->ibdev)) { + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (!ret) + ret = sprintf(buf, "0x%x\n", attr.sm_lid); + } + mutex_unlock(&p->ibdev->sysfs_mutex); + return ret; } static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused, char *buf) { struct ib_port_attr attr; - ssize_t ret; + ssize_t ret = -ENODEV; - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "%d\n", attr.sm_sl); + mutex_lock(&p->ibdev->sysfs_mutex); + if (ibdev_is_alive(p->ibdev)) { + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (!ret) + ret = sprintf(buf, "%d\n", attr.sm_sl); + } + mutex_unlock(&p->ibdev->sysfs_mutex); + return ret; } static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused, char *buf) { struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; + ssize_t ret = -ENODEV; - return sprintf(buf, "0x%08x\n", attr.port_cap_flags); + mutex_lock(&p->ibdev->sysfs_mutex); + if (ibdev_is_alive(p->ibdev)) { + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (!ret) + ret = sprintf(buf, "0x%08x\n", attr.port_cap_flags); + } + mutex_unlock(&p->ibdev->sysfs_mutex); + return ret; } static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, @@ -188,24 +205,33 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, struct ib_port_attr attr; char *speed = ""; int rate; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - switch (attr.active_speed) { - case 2: speed = " DDR"; break; - case 4: speed = " QDR"; break; + ssize_t ret = -ENODEV; + + mutex_lock(&p->ibdev->sysfs_mutex); + if (ibdev_is_alive(p->ibdev)) { + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (!ret) { + switch (attr.active_speed) { + case 2: speed = " DDR"; break; + case 4: speed = " QDR"; break; + } + + rate = 25 * ib_width_enum_to_int(attr.active_width) * + attr.active_speed; + if (rate < 0) { + ret = -EINVAL; + goto out; + } + + ret = sprintf(buf, "%d%s Gb/sec (%dX%s)\n", + rate / 10, rate % 10 ? ".5" : "", + ib_width_enum_to_int(attr.active_width), + speed); + } } - - rate = 25 * ib_width_enum_to_int(attr.active_width) * attr.active_speed; - if (rate < 0) - return -EINVAL; - - return sprintf(buf, "%d%s Gb/sec (%dX%s)\n", - rate / 10, rate % 10 ? ".5" : "", - ib_width_enum_to_int(attr.active_width), speed); +out: + mutex_unlock(&p->ibdev->sysfs_mutex); + return ret; } static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, @@ -213,22 +239,26 @@ static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, { struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - switch (attr.phys_state) { - case 1: return sprintf(buf, "1: Sleep\n"); - case 2: return sprintf(buf, "2: Polling\n"); - case 3: return sprintf(buf, "3: Disabled\n"); - case 4: return sprintf(buf, "4: PortConfigurationTraining\n"); - case 5: return sprintf(buf, "5: LinkUp\n"); - case 6: return sprintf(buf, "6: LinkErrorRecovery\n"); - case 7: return sprintf(buf, "7: Phy Test\n"); - default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state); + ssize_t ret = -ENODEV; + + mutex_lock(&p->ibdev->sysfs_mutex); + if (ibdev_is_alive(p->ibdev)) { + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (!ret) { + switch (attr.phys_state) { + case 1: ret = sprintf(buf, "1: Sleep\n"); + case 2: ret = sprintf(buf, "2: Polling\n"); + case 3: ret = sprintf(buf, "3: Disabled\n"); + case 4: ret = sprintf(buf, "4: PortConfigurationTraining\n"); + case 5: ret = sprintf(buf, "5: LinkUp\n"); + case 6: ret = sprintf(buf, "6: LinkErrorRecovery\n"); + case 7: ret = sprintf(buf, "7: Phy Test\n"); + default: ret = sprintf(buf, "%d: <unknown>\n", attr.phys_state); + } + } } + mutex_unlock(&p->ibdev->sysfs_mutex); + return ret; } static PORT_ATTR_RO(state); @@ -258,21 +288,24 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); union ib_gid gid; - ssize_t ret; - - ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid); - if (ret) - return ret; - - return sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) gid.raw)[0]), - be16_to_cpu(((__be16 *) gid.raw)[1]), - be16_to_cpu(((__be16 *) gid.raw)[2]), - be16_to_cpu(((__be16 *) gid.raw)[3]), - be16_to_cpu(((__be16 *) gid.raw)[4]), - be16_to_cpu(((__be16 *) gid.raw)[5]), - be16_to_cpu(((__be16 *) gid.raw)[6]), - be16_to_cpu(((__be16 *) gid.raw)[7])); + ssize_t ret = -ENODEV; + + mutex_lock(&p->ibdev->sysfs_mutex); + if (ibdev_is_alive(p->ibdev)) { + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid); + if (!ret) + ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) gid.raw)[0]), + be16_to_cpu(((__be16 *) gid.raw)[1]), + be16_to_cpu(((__be16 *) gid.raw)[2]), + be16_to_cpu(((__be16 *) gid.raw)[3]), + be16_to_cpu(((__be16 *) gid.raw)[4]), + be16_to_cpu(((__be16 *) gid.raw)[5]), + be16_to_cpu(((__be16 *) gid.raw)[6]), + be16_to_cpu(((__be16 *) gid.raw)[7])); + } + mutex_unlock(&p->ibdev->sysfs_mutex); + return ret; } static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, @@ -281,13 +314,16 @@ static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); u16 pkey; - ssize_t ret; - - ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey); - if (ret) - return ret; + ssize_t ret = -ENODEV; - return sprintf(buf, "0x%04x\n", pkey); + mutex_lock(&p->ibdev->sysfs_mutex); + if (ibdev_is_alive(p->ibdev)) { + ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey); + if (!ret) + ret = sprintf(buf, "0x%04x\n", pkey); + } + mutex_unlock(&p->ibdev->sysfs_mutex); + return ret; } #define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ @@ -310,6 +346,12 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, if (!p->ibdev->process_mad) return sprintf(buf, "N/A (no PMA)\n"); + mutex_lock(&p->ibdev->sysfs_mutex); + if (!ibdev_is_alive(p->ibdev)) { + ret = -ENODEV; + goto out; + } + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) { @@ -356,7 +398,7 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, out: kfree(in_mad); kfree(out_mad); - + mutex_unlock(&p->ibdev->sysfs_mutex); return ret; } @@ -376,6 +418,12 @@ static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +/* + * There is no bit allocated for port_xmit_wait in the CounterSelect field + * (IB spec). However, since this bit is ignored when reading + * (show_pma_counter), the _counter field of port_xmit_wait can be set to zero. + */ +static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320); static struct attribute *pma_attrs[] = { &port_pma_attr_symbol_error.attr.attr, @@ -394,6 +442,7 @@ static struct attribute *pma_attrs[] = { &port_pma_attr_port_rcv_data.attr.attr, &port_pma_attr_port_xmit_packets.attr.attr, &port_pma_attr_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, NULL }; @@ -511,19 +560,10 @@ static int add_port(struct ib_device *device, int port_num) p->ibdev = device; p->port_num = port_num; - p->kobj.ktype = &port_type; - - p->kobj.parent = kobject_get(&device->ports_parent); - if (!p->kobj.parent) { - ret = -EBUSY; - goto err; - } - - ret = kobject_set_name(&p->kobj, "%d", port_num); - if (ret) - goto err_put; - ret = kobject_register(&p->kobj); + ret = kobject_init_and_add(&p->kobj, &port_type, + kobject_get(device->ports_parent), + "%d", port_num); if (ret) goto err_put; @@ -552,6 +592,7 @@ static int add_port(struct ib_device *device, int port_num) list_add_tail(&p->kobj.entry, &device->port_list); + kobject_uevent(&p->kobj, KOBJ_ADD); return 0; err_free_pkey: @@ -573,9 +614,7 @@ err_remove_pma: sysfs_remove_group(&p->kobj, &pma_group); err_put: - kobject_put(&device->ports_parent); - -err: + kobject_put(device->ports_parent); kfree(p); return ret; } @@ -600,20 +639,20 @@ static ssize_t show_sys_image_guid(struct class_device *cdev, char *buf) { struct ib_device *dev = container_of(cdev, struct ib_device, class_dev); struct ib_device_attr attr; - ssize_t ret; - - if (!ibdev_is_alive(dev)) - return -ENODEV; - - ret = ib_query_device(dev, &attr); - if (ret) - return ret; - - return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3])); + ssize_t ret = -ENODEV; + + mutex_lock(&dev->sysfs_mutex); + if (ibdev_is_alive(dev)) { + ret = ib_query_device(dev, &attr); + if (!ret) + ret = sprintf(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]), + be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]), + be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]), + be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3])); + } + mutex_unlock(&dev->sysfs_mutex); + return ret; } static ssize_t show_node_guid(struct class_device *cdev, char *buf) @@ -642,24 +681,26 @@ static ssize_t set_node_desc(struct class_device *cdev, const char *buf, { struct ib_device *dev = container_of(cdev, struct ib_device, class_dev); struct ib_device_modify desc = {}; - int ret; + int ret = -ENODEV; if (!dev->modify_device) return -EIO; memcpy(desc.node_desc, buf, min_t(int, count, 64)); - ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc); - if (ret) - return ret; - - return count; + mutex_lock(&dev->sysfs_mutex); + if (ibdev_is_alive(dev)) { + ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc); + if (!ret) + ret = count; + } + mutex_unlock(&dev->sysfs_mutex); + return ret; } static CLASS_DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); static CLASS_DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); static CLASS_DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); -static CLASS_DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, - set_node_desc); +static CLASS_DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); static struct class_device_attribute *ib_class_attributes[] = { &class_device_attr_node_type, @@ -674,6 +715,124 @@ static struct class ib_class = { .uevent = ib_device_uevent, }; +/* Show a given an attribute in the statistics group */ +static ssize_t show_protocol_stat(struct class_device *cdev, + char *buf, + unsigned offset) +{ + struct ib_device *dev = container_of(cdev, struct ib_device, class_dev); + union rdma_protocol_stats stats; + ssize_t ret = -ENODEV; + + mutex_lock(&dev->sysfs_mutex); + if (ibdev_is_alive(dev)) { + ret = dev->get_protocol_stats(dev, &stats); + if (!ret) + ret = sprintf(buf, "%llu\n", + (unsigned long long) + ((u64 *) &stats)[offset]); + } + mutex_unlock(&dev->sysfs_mutex); + return ret; +} + +/* generate a read-only iwarp statistics attribute */ +#define IW_STATS_ENTRY(name) \ +static ssize_t show_##name(struct class_device *cdev, \ + char *buf) \ +{ \ + return show_protocol_stat(cdev, buf, \ + offsetof(struct iw_protocol_stats, name) / \ + sizeof (u64)); \ +} \ +static CLASS_DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +IW_STATS_ENTRY(ipInReceives); +IW_STATS_ENTRY(ipInHdrErrors); +IW_STATS_ENTRY(ipInTooBigErrors); +IW_STATS_ENTRY(ipInNoRoutes); +IW_STATS_ENTRY(ipInAddrErrors); +IW_STATS_ENTRY(ipInUnknownProtos); +IW_STATS_ENTRY(ipInTruncatedPkts); +IW_STATS_ENTRY(ipInDiscards); +IW_STATS_ENTRY(ipInDelivers); +IW_STATS_ENTRY(ipOutForwDatagrams); +IW_STATS_ENTRY(ipOutRequests); +IW_STATS_ENTRY(ipOutDiscards); +IW_STATS_ENTRY(ipOutNoRoutes); +IW_STATS_ENTRY(ipReasmTimeout); +IW_STATS_ENTRY(ipReasmReqds); +IW_STATS_ENTRY(ipReasmOKs); +IW_STATS_ENTRY(ipReasmFails); +IW_STATS_ENTRY(ipFragOKs); +IW_STATS_ENTRY(ipFragFails); +IW_STATS_ENTRY(ipFragCreates); +IW_STATS_ENTRY(ipInMcastPkts); +IW_STATS_ENTRY(ipOutMcastPkts); +IW_STATS_ENTRY(ipInBcastPkts); +IW_STATS_ENTRY(ipOutBcastPkts); +IW_STATS_ENTRY(tcpRtoAlgorithm); +IW_STATS_ENTRY(tcpRtoMin); +IW_STATS_ENTRY(tcpRtoMax); +IW_STATS_ENTRY(tcpMaxConn); +IW_STATS_ENTRY(tcpActiveOpens); +IW_STATS_ENTRY(tcpPassiveOpens); +IW_STATS_ENTRY(tcpAttemptFails); +IW_STATS_ENTRY(tcpEstabResets); +IW_STATS_ENTRY(tcpCurrEstab); +IW_STATS_ENTRY(tcpInSegs); +IW_STATS_ENTRY(tcpOutSegs); +IW_STATS_ENTRY(tcpRetransSegs); +IW_STATS_ENTRY(tcpInErrs); +IW_STATS_ENTRY(tcpOutRsts); + +static struct attribute *iw_proto_stats_attrs[] = { + &class_device_attr_ipInReceives.attr, + &class_device_attr_ipInHdrErrors.attr, + &class_device_attr_ipInTooBigErrors.attr, + &class_device_attr_ipInNoRoutes.attr, + &class_device_attr_ipInAddrErrors.attr, + &class_device_attr_ipInUnknownProtos.attr, + &class_device_attr_ipInTruncatedPkts.attr, + &class_device_attr_ipInDiscards.attr, + &class_device_attr_ipInDelivers.attr, + &class_device_attr_ipOutForwDatagrams.attr, + &class_device_attr_ipOutRequests.attr, + &class_device_attr_ipOutDiscards.attr, + &class_device_attr_ipOutNoRoutes.attr, + &class_device_attr_ipReasmTimeout.attr, + &class_device_attr_ipReasmReqds.attr, + &class_device_attr_ipReasmOKs.attr, + &class_device_attr_ipReasmFails.attr, + &class_device_attr_ipFragOKs.attr, + &class_device_attr_ipFragFails.attr, + &class_device_attr_ipFragCreates.attr, + &class_device_attr_ipInMcastPkts.attr, + &class_device_attr_ipOutMcastPkts.attr, + &class_device_attr_ipInBcastPkts.attr, + &class_device_attr_ipOutBcastPkts.attr, + &class_device_attr_tcpRtoAlgorithm.attr, + &class_device_attr_tcpRtoMin.attr, + &class_device_attr_tcpRtoMax.attr, + &class_device_attr_tcpMaxConn.attr, + &class_device_attr_tcpActiveOpens.attr, + &class_device_attr_tcpPassiveOpens.attr, + &class_device_attr_tcpAttemptFails.attr, + &class_device_attr_tcpEstabResets.attr, + &class_device_attr_tcpCurrEstab.attr, + &class_device_attr_tcpInSegs.attr, + &class_device_attr_tcpOutSegs.attr, + &class_device_attr_tcpRetransSegs.attr, + &class_device_attr_tcpInErrs.attr, + &class_device_attr_tcpOutRsts.attr, + NULL +}; + +static struct attribute_group iw_stats_group = { + .name = "proto_stats", + .attrs = iw_proto_stats_attrs, +}; + int ib_device_register_sysfs(struct ib_device *device) { struct class_device *class_dev = &device->class_dev; @@ -697,17 +856,12 @@ int ib_device_register_sysfs(struct ib_device *device) goto err_unregister; } - device->ports_parent.parent = kobject_get(&class_dev->kobj); - if (!device->ports_parent.parent) { - ret = -EBUSY; - goto err_unregister; - } - ret = kobject_set_name(&device->ports_parent, "ports"); - if (ret) - goto err_put; - ret = kobject_register(&device->ports_parent); - if (ret) + device->ports_parent = kobject_create_and_add("ports", + kobject_get(&class_dev->kobj)); + if (!device->ports_parent) { + ret = -ENOMEM; goto err_put; + } if (device->node_type == RDMA_NODE_IB_SWITCH) { ret = add_port(device, 0); @@ -721,6 +875,12 @@ int ib_device_register_sysfs(struct ib_device *device) } } + if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) { + ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group); + if (ret) + goto err_put; + } + return 0; err_put: @@ -738,7 +898,7 @@ err_put: } } - kobject_put(&class_dev->kobj); + kobject_unregister(&class_dev->kobj); err_unregister: class_device_unregister(class_dev); @@ -761,7 +921,9 @@ void ib_device_unregister_sysfs(struct ib_device *device) kobject_unregister(p); } - kobject_unregister(&device->ports_parent); + kobject_put(device->ports_parent); + /* WA for memory leak */ + kfree(device->ports_parent); class_device_unregister(&device->class_dev); } diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 424983f..1c967e1 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -29,8 +29,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ucm.c 4311 2005-12-05 18:42:01Z sean.hefty $ */ #include <linux/completion.h> @@ -106,6 +104,9 @@ enum { IB_UCM_MAX_DEVICES = 32 }; +/* ib_cm and ib_user_cm modules share /sys/class/infiniband_cm */ +extern struct class cm_class; + #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) static void ib_ucm_add_one(struct ib_device *device); @@ -1152,6 +1153,14 @@ static unsigned int ib_ucm_poll(struct file *filp, return mask; } +/* + * ib_ucm_open() does not need the BKL: + * + * - no global state is referred to; + * - there is no ioctl method to race against; + * - no further module initialization is required for open to work + * after the device is registered. + */ static int ib_ucm_open(struct inode *inode, struct file *filp) { struct ib_ucm_file *file; @@ -1199,7 +1208,7 @@ static int ib_ucm_close(struct inode *inode, struct file *filp) return 0; } -static void ib_ucm_release_class_dev(struct class_device *class_dev) +static void ucm_release_class_dev(struct class_device *class_dev) { struct ib_ucm_device *dev; @@ -1217,11 +1226,6 @@ static const struct file_operations ucm_fops = { .poll = ib_ucm_poll, }; -static struct class ucm_class = { - .name = "infiniband_cm", - .release = ib_ucm_release_class_dev -}; - static ssize_t show_ibdev(struct class_device *class_dev, char *buf) { struct ib_ucm_device *dev; @@ -1257,9 +1261,10 @@ static void ib_ucm_add_one(struct ib_device *device) if (cdev_add(&ucm_dev->dev, IB_UCM_BASE_DEV + ucm_dev->devnum, 1)) goto err; - ucm_dev->class_dev.class = &ucm_class; + ucm_dev->class_dev.class = &cm_class; ucm_dev->class_dev.dev = device->dma_device; ucm_dev->class_dev.devt = ucm_dev->dev.dev; + ucm_dev->class_dev.release = ucm_release_class_dev; snprintf(ucm_dev->class_dev.class_id, BUS_ID_SIZE, "ucm%d", ucm_dev->devnum); if (class_device_register(&ucm_dev->class_dev)) @@ -1306,40 +1311,34 @@ static int __init ib_ucm_init(void) "infiniband_cm"); if (ret) { printk(KERN_ERR "ucm: couldn't register device number\n"); - goto err; + goto error1; } - ret = class_register(&ucm_class); - if (ret) { - printk(KERN_ERR "ucm: couldn't create class infiniband_cm\n"); - goto err_chrdev; - } - - ret = class_create_file(&ucm_class, &class_attr_abi_version); + ret = class_create_file(&cm_class, &class_attr_abi_version); if (ret) { printk(KERN_ERR "ucm: couldn't create abi_version attribute\n"); - goto err_class; + goto error2; } ret = ib_register_client(&ucm_client); if (ret) { printk(KERN_ERR "ucm: couldn't register client\n"); - goto err_class; + goto error3; } return 0; -err_class: - class_unregister(&ucm_class); -err_chrdev: +error3: + class_remove_file(&cm_class, &class_attr_abi_version); +error2: unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); -err: +error1: return ret; } static void __exit ib_ucm_cleanup(void) { ib_unregister_client(&ucm_client); - class_unregister(&ucm_class); + class_remove_file(&cm_class, &class_attr_abi_version); unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); idr_destroy(&ctx_id_table); } diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index ef52dbb..31f3671 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -31,6 +31,7 @@ */ #include <linux/completion.h> +#include <linux/file.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/idr.h> @@ -80,9 +81,7 @@ struct ucma_multicast { u64 uid; struct list_head list; - struct sockaddr addr; - u8 pad[sizeof(struct sockaddr_in6) - - sizeof(struct sockaddr)]; + struct sockaddr_storage addr; }; struct ucma_event { @@ -602,18 +601,18 @@ static ssize_t ucma_query_route(struct ucma_file *file, return PTR_ERR(ctx); memset(&resp, 0, sizeof resp); - addr = &ctx->cm_id->route.addr.src_addr; + addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)); - addr = &ctx->cm_id->route.addr.dst_addr; + addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)); if (!ctx->cm_id->device) goto out; - resp.node_guid = ctx->cm_id->device->node_guid; + resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; resp.port_num = ctx->cm_id->port_num; switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) { case RDMA_TRANSPORT_IB: @@ -905,14 +904,14 @@ static ssize_t ucma_join_multicast(struct ucma_file *file, mutex_lock(&file->mut); mc = ucma_alloc_multicast(ctx); - if (IS_ERR(mc)) { - ret = PTR_ERR(mc); + if (mc == NULL) { + ret = -ENOMEM; goto err1; } mc->uid = cmd.uid; memcpy(&mc->addr, &cmd.addr, sizeof cmd.addr); - ret = rdma_join_multicast(ctx->cm_id, &mc->addr, mc); + ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc); if (ret) goto err2; @@ -928,7 +927,7 @@ static ssize_t ucma_join_multicast(struct ucma_file *file, return 0; err3: - rdma_leave_multicast(ctx->cm_id, &mc->addr); + rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); ucma_cleanup_mc_events(mc); err2: mutex_lock(&mut); @@ -974,7 +973,7 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, goto out; } - rdma_leave_multicast(mc->ctx->cm_id, &mc->addr); + rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); mutex_lock(&mc->ctx->file->mut); ucma_cleanup_mc_events(mc); list_del(&mc->list); @@ -991,6 +990,96 @@ out: return ret; } +static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2) +{ + /* Acquire mutex's based on pointer comparison to prevent deadlock. */ + if (file1 < file2) { + mutex_lock(&file1->mut); + mutex_lock(&file2->mut); + } else { + mutex_lock(&file2->mut); + mutex_lock(&file1->mut); + } +} + +static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2) +{ + if (file1 < file2) { + mutex_unlock(&file2->mut); + mutex_unlock(&file1->mut); + } else { + mutex_unlock(&file1->mut); + mutex_unlock(&file2->mut); + } +} + +static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file) +{ + struct ucma_event *uevent, *tmp; + + list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) + if (uevent->ctx == ctx) + list_move_tail(&uevent->list, &file->event_list); +} + +static ssize_t ucma_migrate_id(struct ucma_file *new_file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_migrate_id cmd; + struct rdma_ucm_migrate_resp resp; + struct ucma_context *ctx; + struct file *filp; + struct ucma_file *cur_file; + int ret = 0; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + /* Get current fd to protect against it being closed */ + filp = fget(cmd.fd); + if (!filp) + return -ENOENT; + + /* Validate current fd and prevent destruction of id. */ + ctx = ucma_get_ctx(filp->private_data, cmd.id); + if (IS_ERR(ctx)) { + ret = PTR_ERR(ctx); + goto file_put; + } + + cur_file = ctx->file; + if (cur_file == new_file) { + resp.events_reported = ctx->events_reported; + goto response; + } + + /* + * Migrate events between fd's, maintaining order, and avoiding new + * events being added before existing events. + */ + ucma_lock_files(cur_file, new_file); + mutex_lock(&mut); + + list_move_tail(&ctx->list, &new_file->ctx_list); + ucma_move_events(ctx, new_file); + ctx->file = new_file; + resp.events_reported = ctx->events_reported; + + mutex_unlock(&mut); + ucma_unlock_files(cur_file, new_file); + +response: + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + ret = -EFAULT; + + ucma_put_ctx(ctx); +file_put: + fput(filp); + return ret; +} + static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) = { @@ -1012,6 +1101,7 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast, [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, + [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id }; static ssize_t ucma_write(struct file *filp, const char __user *buf, @@ -1056,6 +1146,14 @@ static unsigned int ucma_poll(struct file *filp, struct poll_table_struct *wait) return mask; } +/* + * ucma_open() does not need the BKL: + * + * - no global state is referred to; + * - there is no ioctl method to race against; + * - no further module initialization is required for open to work + * after the device is registered. + */ static int ucma_open(struct inode *inode, struct file *filp) { struct ucma_file *file; diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 997c07d..9d74fb8 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -29,8 +29,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ud_header.c 1349 2004-12-16 21:09:43Z roland $ */ #include <linux/errno.h> @@ -243,6 +241,36 @@ void ib_ud_header_init(int payload_bytes, EXPORT_SYMBOL(ib_ud_header_init); /** + * ib_lrh_header_pack - Pack LRH header struct into wire format + * @lrh:unpacked LRH header struct + * @buf:Buffer to pack into + * + * ib_lrh_header_pack() packs the LRH header structure @lrh into + * wire format in the buffer @buf. + */ +int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf) +{ + ib_pack(lrh_table, ARRAY_SIZE(lrh_table), lrh, buf); + return 0; +} +EXPORT_SYMBOL(ib_lrh_header_pack); + +/** + * ib_lrh_header_unpack - Unpack LRH structure from wire format + * @lrh:unpacked LRH header struct + * @buf:Buffer to pack into + * + * ib_lrh_header_unpack() unpacks the LRH header structure from + * wire format (in buf) into @lrh. + */ +int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh) +{ + ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), buf, lrh); + return 0; +} +EXPORT_SYMBOL(ib_lrh_header_unpack); + +/** * ib_ud_header_pack - Pack UD header struct into wire format * @header:UD header struct * @buf:Buffer to pack into diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index dd4b7ad..bc0245c 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -30,8 +30,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: uverbs_mem.c 2743 2005-06-28 22:27:59Z roland $ */ #include <linux/mm.h> @@ -42,10 +40,6 @@ #include "uverbs.h" -static int allow_weak_ordering; -module_param(allow_weak_ordering, bool, 0444); -MODULE_PARM_DESC(allow_weak_ordering, "Allow weak ordering for data registered memory"); - #define IB_UMEM_MAX_PAGE_CHUNK \ ((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) / \ ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] - \ @@ -101,21 +95,14 @@ static void dma_unmap_sg_ia64(struct ib_device *ibdev, #endif -static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty, int dmasync) +static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { struct ib_umem_chunk *chunk, *tmp; int i; - DEFINE_DMA_ATTRS(attrs); - - if (dmasync) - dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); - else if (allow_weak_ordering) - dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs); - list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) { - ib_dma_unmap_sg_attrs(dev, chunk->page_list, - chunk->nents, DMA_BIDIRECTIONAL, &attrs); + ib_dma_unmap_sg(dev, chunk->page_list, + chunk->nents, DMA_BIDIRECTIONAL); for (i = 0; i < chunk->nents; ++i) { struct page *page = sg_page(&chunk->page_list[i]); @@ -136,7 +123,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d * @access: IB_ACCESS_xxx flags for memory being pinned * @dmasync: flush in-flight DMA when the memory region is written */ -static struct ib_umem *__ib_umem_get(struct ib_ucontext *context, unsigned long addr, +struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, size_t size, int access, int dmasync) { struct ib_umem *umem; @@ -154,9 +141,6 @@ static struct ib_umem *__ib_umem_get(struct ib_ucontext *context, unsigned long if (dmasync) dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); - else if (allow_weak_ordering) - dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs); - if (!can_do_mlock()) return ERR_PTR(-EPERM); @@ -214,7 +198,7 @@ static struct ib_umem *__ib_umem_get(struct ib_ucontext *context, unsigned long ret = 0; while (npages) { ret = get_user_pages(current, current->mm, cur_base, - min_t(int, npages, + min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), 1, !umem->writable, page_list, vma_list); @@ -268,7 +252,7 @@ static struct ib_umem *__ib_umem_get(struct ib_ucontext *context, unsigned long out: if (ret < 0) { - __ib_umem_release(context->device, umem, 0, dmasync); + __ib_umem_release(context->device, umem, 0); kfree(umem); } else current->mm->locked_vm = locked; @@ -280,21 +264,8 @@ out: return ret < 0 ? ERR_PTR(ret) : umem; } - -struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, - size_t size, int access) -{ - return __ib_umem_get(context, addr, size, access, 0); -} EXPORT_SYMBOL(ib_umem_get); -struct ib_umem *ib_umem_get_dmasync(struct ib_ucontext *context, unsigned long addr, - size_t size, int access) -{ - return __ib_umem_get(context, addr, size, access, 1); -} -EXPORT_SYMBOL(ib_umem_get_dmasync); - static void ib_umem_account(struct work_struct *work) { struct ib_umem *umem = container_of(work, struct ib_umem, work); @@ -310,13 +281,13 @@ static void ib_umem_account(struct work_struct *work) * ib_umem_release - release memory pinned with ib_umem_get * @umem: umem struct to release */ -static void ib_umem_release_attr(struct ib_umem *umem, int dmasync) +void ib_umem_release(struct ib_umem *umem) { struct ib_ucontext *context = umem->context; struct mm_struct *mm; unsigned long diff; - __ib_umem_release(umem->context->device, umem, 1, dmasync); + __ib_umem_release(umem->context->device, umem, 1); mm = get_task_mm(current); if (!mm) { @@ -351,19 +322,8 @@ static void ib_umem_release_attr(struct ib_umem *umem, int dmasync) mmput(mm); kfree(umem); } - -void ib_umem_release(struct ib_umem *umem) -{ - return ib_umem_release_attr(umem, 0); -} EXPORT_SYMBOL(ib_umem_release); -void ib_umem_release_dmasync(struct ib_umem *umem) -{ - return ib_umem_release_attr(umem, 1); -} -EXPORT_SYMBOL(ib_umem_release_dmasync); - int ib_umem_page_count(struct ib_umem *umem) { struct ib_umem_chunk *chunk; diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index b53eac4..895b231 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -2,6 +2,7 @@ * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2008 Cisco. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,8 +31,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: user_mad.c 5596 2006-03-03 01:00:07Z sean.hefty $ */ #include <linux/module.h> @@ -42,12 +41,12 @@ #include <linux/cdev.h> #include <linux/dma-mapping.h> #include <linux/poll.h> -#include <linux/rwsem.h> +#include <linux/mutex.h> #include <linux/kref.h> #include <linux/compat.h> +#include <linux/semaphore.h> #include <asm/uaccess.h> -#include <asm/semaphore.h> #include <rdma/ib_mad.h> #include <rdma/ib_user_mad.h> @@ -94,7 +93,7 @@ struct ib_umad_port { struct class_device *sm_class_dev; struct semaphore sm_sem; - struct rw_semaphore mutex; + struct mutex file_mutex; struct list_head file_list; struct ib_device *ib_dev; @@ -110,11 +109,11 @@ struct ib_umad_device { }; struct ib_umad_file { + struct mutex mutex; struct ib_umad_port *port; struct list_head recv_list; struct list_head send_list; struct list_head port_list; - spinlock_t recv_lock; spinlock_t send_lock; wait_queue_head_t recv_wait; struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS]; @@ -156,7 +155,7 @@ static int hdr_size(struct ib_umad_file *file) sizeof (struct ib_user_mad_hdr_old); } -/* caller must hold port->mutex at least for reading */ +/* caller must hold file->mutex */ static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id) { return file->agents_dead ? NULL : file->agent[id]; @@ -168,32 +167,30 @@ static int queue_packet(struct ib_umad_file *file, { int ret = 1; - down_read(&file->port->mutex); + mutex_lock(&file->mutex); for (packet->mad.hdr.id = 0; packet->mad.hdr.id < IB_UMAD_MAX_AGENTS; packet->mad.hdr.id++) if (agent == __get_agent(file, packet->mad.hdr.id)) { - spin_lock_irq(&file->recv_lock); list_add_tail(&packet->list, &file->recv_list); - spin_unlock_irq(&file->recv_lock); wake_up_interruptible(&file->recv_wait); ret = 0; break; } - up_read(&file->port->mutex); + mutex_unlock(&file->mutex); return ret; } static void dequeue_send(struct ib_umad_file *file, struct ib_umad_packet *packet) - { +{ spin_lock_irq(&file->send_lock); list_del(&packet->list); spin_unlock_irq(&file->send_lock); - } +} static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *send_wc) @@ -341,10 +338,10 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, if (count < hdr_size(file)) return -EINVAL; - spin_lock_irq(&file->recv_lock); + mutex_lock(&file->mutex); while (list_empty(&file->recv_list)) { - spin_unlock_irq(&file->recv_lock); + mutex_unlock(&file->mutex); if (filp->f_flags & O_NONBLOCK) return -EAGAIN; @@ -353,13 +350,13 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, !list_empty(&file->recv_list))) return -ERESTARTSYS; - spin_lock_irq(&file->recv_lock); + mutex_lock(&file->mutex); } packet = list_entry(file->recv_list.next, struct ib_umad_packet, list); list_del(&packet->list); - spin_unlock_irq(&file->recv_lock); + mutex_unlock(&file->mutex); if (packet->recv_wc) ret = copy_recv_mad(file, buf, packet, count); @@ -368,9 +365,9 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, if (ret < 0) { /* Requeue packet */ - spin_lock_irq(&file->recv_lock); + mutex_lock(&file->mutex); list_add(&packet->list, &file->recv_list); - spin_unlock_irq(&file->recv_lock); + mutex_unlock(&file->mutex); } else { if (packet->recv_wc) ib_free_recv_mad(packet->recv_wc); @@ -481,7 +478,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, goto err; } - down_read(&file->port->mutex); + mutex_lock(&file->mutex); agent = __get_agent(file, packet->mad.hdr.id); if (!agent) { @@ -577,7 +574,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, if (ret) goto err_send; - up_read(&file->port->mutex); + mutex_unlock(&file->mutex); return count; err_send: @@ -587,7 +584,7 @@ err_msg: err_ah: ib_destroy_ah(ah); err_up: - up_read(&file->port->mutex); + mutex_unlock(&file->mutex); err: kfree(packet); return ret; @@ -613,11 +610,12 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, { struct ib_user_mad_reg_req ureq; struct ib_mad_reg_req req; - struct ib_mad_agent *agent; + struct ib_mad_agent *agent = NULL; int agent_id; int ret; - down_write(&file->port->mutex); + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); if (!file->port->ib_dev) { ret = -EPIPE; @@ -666,13 +664,13 @@ found: send_handler, recv_handler, file); if (IS_ERR(agent)) { ret = PTR_ERR(agent); + agent = NULL; goto out; } if (put_user(agent_id, (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) { ret = -EFAULT; - ib_unregister_mad_agent(agent); goto out; } @@ -690,7 +688,13 @@ found: ret = 0; out: - up_write(&file->port->mutex); + mutex_unlock(&file->mutex); + + if (ret && agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + return ret; } @@ -703,7 +707,8 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) if (get_user(id, arg)) return -EFAULT; - down_write(&file->port->mutex); + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) { ret = -EINVAL; @@ -714,11 +719,13 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) file->agent[id] = NULL; out: - up_write(&file->port->mutex); + mutex_unlock(&file->mutex); if (agent) ib_unregister_mad_agent(agent); + mutex_unlock(&file->port->file_mutex); + return ret; } @@ -726,12 +733,12 @@ static long ib_umad_enable_pkey(struct ib_umad_file *file) { int ret = 0; - down_write(&file->port->mutex); + mutex_lock(&file->mutex); if (file->already_used) ret = -EINVAL; else file->use_pkey_index = 1; - up_write(&file->port->mutex); + mutex_unlock(&file->mutex); return ret; } @@ -768,6 +775,19 @@ static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd, } #endif +/* + * ib_umad_open() does not need the BKL: + * + * - umad_port[] accesses are protected by port_lock, the + * ib_umad_port structures are properly reference counted, and + * everything else is purely local to the file being created, so + * races against other open calls are not a problem; + * - the ioctl method does not affect any global state outside of the + * file structure being operated on; + * - the port is added to umad_port[] as the last part of module + * initialization so the open method will either immediately run + * -ENXIO, or all required initialization will be done. + */ static int ib_umad_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; @@ -783,7 +803,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp) if (!port) return -ENXIO; - down_write(&port->mutex); + mutex_lock(&port->file_mutex); if (!port->ib_dev) { ret = -ENXIO; @@ -797,7 +817,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp) goto out; } - spin_lock_init(&file->recv_lock); + mutex_init(&file->mutex); spin_lock_init(&file->send_lock); INIT_LIST_HEAD(&file->recv_list); INIT_LIST_HEAD(&file->send_list); @@ -809,7 +829,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp) list_add_tail(&file->port_list, &port->file_list); out: - up_write(&port->mutex); + mutex_unlock(&port->file_mutex); return ret; } @@ -821,7 +841,8 @@ static int ib_umad_close(struct inode *inode, struct file *filp) int already_dead; int i; - down_write(&file->port->mutex); + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); already_dead = file->agents_dead; file->agents_dead = 1; @@ -834,14 +855,14 @@ static int ib_umad_close(struct inode *inode, struct file *filp) list_del(&file->port_list); - downgrade_write(&file->port->mutex); + mutex_unlock(&file->mutex); if (!already_dead) for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i) if (file->agent[i]) ib_unregister_mad_agent(file->agent[i]); - up_read(&file->port->mutex); + mutex_unlock(&file->port->file_mutex); kfree(file); kref_put(&dev->ref, ib_umad_release_dev); @@ -914,10 +935,10 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp) }; int ret = 0; - down_write(&port->mutex); + mutex_lock(&port->file_mutex); if (port->ib_dev) ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); - up_write(&port->mutex); + mutex_unlock(&port->file_mutex); up(&port->sm_sem); @@ -981,7 +1002,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, port->ib_dev = device; port->port_num = port_num; init_MUTEX(&port->sm_sem); - init_rwsem(&port->mutex); + mutex_init(&port->file_mutex); INIT_LIST_HEAD(&port->file_list); port->dev = cdev_alloc(); @@ -1052,6 +1073,7 @@ err_cdev: static void ib_umad_kill_port(struct ib_umad_port *port) { struct ib_umad_file *file; + int already_dead; int id; class_set_devdata(port->class_dev, NULL); @@ -1067,42 +1089,22 @@ static void ib_umad_kill_port(struct ib_umad_port *port) umad_port[port->dev_num] = NULL; spin_unlock(&port_lock); - down_write(&port->mutex); + mutex_lock(&port->file_mutex); port->ib_dev = NULL; - /* - * Now go through the list of files attached to this port and - * unregister all of their MAD agents. We need to hold - * port->mutex while doing this to avoid racing with - * ib_umad_close(), but we can't hold the mutex for writing - * while calling ib_unregister_mad_agent(), since that might - * deadlock by calling back into queue_packet(). So we - * downgrade our lock to a read lock, and then drop and - * reacquire the write lock for the next iteration. - * - * We do list_del_init() on the file's list_head so that the - * list_del in ib_umad_close() is still OK, even after the - * file is removed from the list. - */ - while (!list_empty(&port->file_list)) { - file = list_entry(port->file_list.next, struct ib_umad_file, - port_list); - + list_for_each_entry(file, &port->file_list, port_list) { + mutex_lock(&file->mutex); + already_dead = file->agents_dead; file->agents_dead = 1; - list_del_init(&file->port_list); - - downgrade_write(&port->mutex); + mutex_unlock(&file->mutex); for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id) if (file->agent[id]) ib_unregister_mad_agent(file->agent[id]); - - up_read(&port->mutex); - down_write(&port->mutex); } - up_write(&port->mutex); + mutex_unlock(&port->file_mutex); clear_bit(port->dev_num, dev_map); } diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index c75eb6c..d62fcad 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -32,8 +32,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: uverbs.h 2559 2005-06-06 19:43:16Z roland $ */ #ifndef UVERBS_H @@ -81,13 +79,13 @@ struct ib_uverbs_device { struct ib_uverbs_event_file { struct kref ref; - struct file *file; struct ib_uverbs_file *uverbs_file; spinlock_t lock; - int is_async; wait_queue_head_t poll_wait; struct fasync_struct *async_queue; struct list_head event_list; + int is_async; + int is_closed; }; struct ib_uverbs_file { diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 9e98cec..56feab6 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -31,8 +31,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: uverbs_cmd.c 2708 2005-06-24 17:27:21Z roland $ */ #include <linux/file.h> @@ -919,7 +917,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, resp->wc[i].opcode = wc[i].opcode; resp->wc[i].vendor_err = wc[i].vendor_err; resp->wc[i].byte_len = wc[i].byte_len; - resp->wc[i].imm_data = (__u32 __force) wc[i].imm_data; + resp->wc[i].ex.imm_data = (__u32 __force) wc[i].ex.imm_data; resp->wc[i].qp_num = wc[i].qp->qp_num; resp->wc[i].src_qp = wc[i].src_qp; resp->wc[i].wc_flags = wc[i].wc_flags; @@ -1463,7 +1461,6 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, next->num_sge = user_wr->num_sge; next->opcode = user_wr->opcode; next->send_flags = user_wr->send_flags; - next->imm_data = (__be32 __force) user_wr->imm_data; if (is_ud) { next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah, @@ -1476,14 +1473,24 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; } else { switch (next->opcode) { - case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; + case IB_WR_RDMA_WRITE: case IB_WR_RDMA_READ: next->wr.rdma.remote_addr = user_wr->wr.rdma.remote_addr; next->wr.rdma.rkey = user_wr->wr.rdma.rkey; break; + case IB_WR_SEND_WITH_IMM: + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + next->ex.invalidate_rkey = + user_wr->ex.invalidate_rkey; + break; case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: next->wr.atomic.remote_addr = diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index cfa567b..68b75e7 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -32,8 +32,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: uverbs_main.c 2733 2005-06-28 19:14:34Z roland $ */ #include <linux/module.h> @@ -352,7 +350,7 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp) struct ib_uverbs_event *entry, *tmp; spin_lock_irq(&file->lock); - file->file = NULL; + file->is_closed = 1; list_for_each_entry_safe(entry, tmp, &file->event_list, list) { if (entry->counter) list_del(&entry->obj_list); @@ -390,7 +388,7 @@ void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) return; spin_lock_irqsave(&file->lock, flags); - if (!file->file) { + if (file->is_closed) { spin_unlock_irqrestore(&file->lock, flags); return; } @@ -423,7 +421,7 @@ static void ib_uverbs_async_handler(struct ib_uverbs_file *file, unsigned long flags; spin_lock_irqsave(&file->async_file->lock, flags); - if (!file->async_file->file) { + if (file->async_file->is_closed) { spin_unlock_irqrestore(&file->async_file->lock, flags); return; } @@ -509,6 +507,7 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, ev_file->uverbs_file = uverbs_file; ev_file->async_queue = NULL; ev_file->is_async = is_async; + ev_file->is_closed = 0; *fd = get_unused_fd(); if (*fd < 0) { @@ -522,8 +521,6 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, goto err_fd; } - ev_file->file = filp; - /* * fops_get() can't fail here, because we're coming from a * system call on a uverbs file, which will already have a @@ -616,6 +613,18 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) return file->device->ib_dev->mmap(file->ucontext, vma); } +/* + * ib_uverbs_open() does not need the BKL: + * + * - dev_table[] accesses are protected by map_lock, the + * ib_uverbs_device structures are properly reference counted, and + * everything else is purely local to the file being created, so + * races against other open calls are not a problem; + * - there is no ioctl method to race against; + * - the device is added to dev_table[] as the last part of module + * initialization, the open method will either immediately run + * -ENXIO, or all required initialization will be done. + */ static int ib_uverbs_open(struct inode *inode, struct file *filp) { struct ib_uverbs_device *dev; @@ -657,7 +666,6 @@ err_module: err: kref_put(&dev->ref, ib_uverbs_release_dev); - return ret; } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 84709ed..a7da9be 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -34,8 +34,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: verbs.c 1349 2004-12-16 21:09:43Z roland $ */ #include <linux/errno.h> @@ -248,7 +246,9 @@ int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask) { - return srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL); + return srq->device->modify_srq ? + srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) : + -ENOSYS; } EXPORT_SYMBOL(ib_modify_srq); @@ -315,7 +315,6 @@ static const struct { } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_INIT] = { .valid = 1, .req_param = { @@ -679,6 +678,9 @@ struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, { struct ib_mr *mr; + if (!pd->device->reg_phys_mr) + return ERR_PTR(-ENOSYS); + mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf, mr_access_flags, iova_start); @@ -750,6 +752,52 @@ int ib_dereg_mr(struct ib_mr *mr) } EXPORT_SYMBOL(ib_dereg_mr); +struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) +{ + struct ib_mr *mr; + + if (!pd->device->alloc_fast_reg_mr) + return ERR_PTR(-ENOSYS); + + mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len); + + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_set(&mr->usecnt, 0); + } + + return mr; +} +EXPORT_SYMBOL(ib_alloc_fast_reg_mr); + +struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device, + int max_page_list_len) +{ + struct ib_fast_reg_page_list *page_list; + + if (!device->alloc_fast_reg_page_list) + return ERR_PTR(-ENOSYS); + + page_list = device->alloc_fast_reg_page_list(device, max_page_list_len); + + if (!IS_ERR(page_list)) { + page_list->device = device; + page_list->max_page_list_len = max_page_list_len; + } + + return page_list; +} +EXPORT_SYMBOL(ib_alloc_fast_reg_page_list); + +void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) +{ + page_list->device->free_fast_reg_page_list(page_list); +} +EXPORT_SYMBOL(ib_free_fast_reg_page_list); + /* Memory windows */ struct ib_mw *ib_alloc_mw(struct ib_pd *pd) diff --git a/drivers/infiniband/core/writeback.c b/drivers/infiniband/core/writeback.c new file mode 100644 index 0000000..7f8ebbe --- /dev/null +++ b/drivers/infiniband/core/writeback.c @@ -0,0 +1,106 @@ +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/pagevec.h> +#include <linux/writeback.h> +#include <linux/mpage.h> +#include <linux/module.h> + +int backport_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, backport_writepage_t writepage, + void *data) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int range_whole = 0; + long nr_to_write = wbc->nr_to_write; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc, data); + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { + unlock_page(page); + ret = 0; + } + if (ret || (--nr_to_write <= 0)) + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + return ret; +} +EXPORT_SYMBOL(backport_write_cache_pages);