af_rds.c
/*
 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/in.h>
#include <linux/ipv6.h>
#include <linux/poll.h>
#include <net/sock.h>

#include "rds.h"

/* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock);
static unsigned long rds_sock_count;
static LIST_HEAD(rds_sock_list);
DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);

/*
 * This is called as the final descriptor referencing this socket is closed.
 * We have to unbind the socket so that another socket can be bound to the
 * address it was using.
 *
 * We have to be careful about racing with the incoming path.  sock_orphan()
 * sets SOCK_DEAD and we use that as an indicator to the rx path that new
 * messages shouldn't be queued.
 */
static int rds_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs;

	if (!sk)
		goto out;

	rs = rds_sk_to_rs(sk);

	sock_orphan(sk);
	/* Note - rds_clear_recv_queue grabs rs_recv_lock, so
	 * that ensures the recv path has completed messing
	 * with the socket. */
	rds_clear_recv_queue(rs);
	rds_cong_remove_socket(rs);

	rds_remove_bound(rs);

	rds_send_drop_to(rs, NULL);
	rds_rdma_drop_keys(rs);
	rds_notify_queue_get(rs, NULL);
	rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue);

	spin_lock_bh(&rds_sock_lock);
	list_del_init(&rs->rs_item);
	rds_sock_count--;
	spin_unlock_bh(&rds_sock_lock);

	rds_trans_put(rs->rs_transport);

	sock->sk = NULL;
	sock_put(sk);
out:
	return 0;
}
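
/*
 * Usage sketch (userspace, not part of this file): close() runs
 * rds_release(), which unbinds the socket, so the same address/port can be
 * bound again afterwards.  A minimal sketch, assuming an RDS transport
 * (e.g. rds_tcp) is loaded; older libcs may lack AF_RDS, hence the
 * fallback, and the port number is purely hypothetical:
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <unistd.h>
 *
 *	#ifndef AF_RDS
 *	#define AF_RDS 21			// from <linux/socket.h>
 *	#endif
 *
 *	int fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
 *		.sin_port = htons(18634),	// hypothetical port
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
 *	close(fd);	// unbinds; the address is reusable by another socket
 */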

/*
 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
 * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
 * this seems more conservative.
 * NB - normally, one would use sk_callback_lock for this, but we can
 * get here from interrupts, whereas the network code grabs sk_callback_lock
 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
 */
void rds_wake_sk_sleep(struct rds_sock *rs)
{
	unsigned long flags;

	read_lock_irqsave(&rs->rs_recv_lock, flags);
	__rds_wake_sk_sleep(rds_rs_to_sk(rs));
	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
}

static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
		       int peer)
{
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
	struct sockaddr_in6 *sin6;
	struct sockaddr_in *sin;
	int uaddr_len;

	/* racy, don't care */
	if (peer) {
		if (ipv6_addr_any(&rs->rs_conn_addr))
			return -ENOTCONN;

		if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
			sin = (struct sockaddr_in *)uaddr;
			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
			sin->sin_family = AF_INET;
			sin->sin_port = rs->rs_conn_port;
			sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
			uaddr_len = sizeof(*sin);
		} else {
			sin6 = (struct sockaddr_in6 *)uaddr;
			sin6->sin6_family = AF_INET6;
			sin6->sin6_port = rs->rs_conn_port;
			sin6->sin6_addr = rs->rs_conn_addr;
			sin6->sin6_flowinfo = 0;
			/* scope_id is the same as in the bound address. */
			sin6->sin6_scope_id = rs->rs_bound_scope_id;
			uaddr_len = sizeof(*sin6);
		}
	} else {
		/* If socket is not yet bound and the socket is connected,
		 * set the return address family to be the same as the
		 * connected address, but with 0 address value.  If it is not
		 * connected, set the family to be AF_UNSPEC (value 0) and
		 * the address size to be that of an IPv4 address.
		 */
		if (ipv6_addr_any(&rs->rs_bound_addr)) {
			if (ipv6_addr_any(&rs->rs_conn_addr)) {
				sin = (struct sockaddr_in *)uaddr;
				memset(sin, 0, sizeof(*sin));
				sin->sin_family = AF_UNSPEC;
				return sizeof(*sin);
			}

#if IS_ENABLED(CONFIG_IPV6)
			if (!(ipv6_addr_type(&rs->rs_conn_addr) &
			      IPV6_ADDR_MAPPED)) {
				sin6 = (struct sockaddr_in6 *)uaddr;
				memset(sin6, 0, sizeof(*sin6));
				sin6->sin6_family = AF_INET6;
				return sizeof(*sin6);
			}
#endif

			sin = (struct sockaddr_in *)uaddr;
			memset(sin, 0, sizeof(*sin));
			sin->sin_family = AF_INET;
			return sizeof(*sin);
		}
		if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
			sin = (struct sockaddr_in *)uaddr;
			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
			sin->sin_family = AF_INET;
			sin->sin_port = rs->rs_bound_port;
			sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
			uaddr_len = sizeof(*sin);
		} else {
			sin6 = (struct sockaddr_in6 *)uaddr;
			sin6->sin6_family = AF_INET6;
			sin6->sin6_port = rs->rs_bound_port;
			sin6->sin6_addr = rs->rs_bound_addr;
			sin6->sin6_flowinfo = 0;
			sin6->sin6_scope_id = rs->rs_bound_scope_id;
			uaddr_len = sizeof(*sin6);
		}
	}

	return uaddr_len;
}
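
/*
 * Usage sketch (userspace, not part of this file): the address family that
 * getsockname() reports depends on the bind/connect state handled above.
 * A minimal sketch, for a freshly created socket:
 *
 *	struct sockaddr_storage ss;
 *	socklen_t slen = sizeof(ss);
 *
 *	getsockname(fd, (struct sockaddr *)&ss, &slen);
 *	// ss.ss_family is AF_UNSPEC here; after a successful bind() or
 *	// connect() it becomes AF_INET or AF_INET6 as appropriate.
 */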

/*
 * RDS' poll is without a doubt the least intuitive part of the interface,
 * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from
 * a network protocol.
 *
 * EPOLLIN is asserted if
 *  - there is data on the receive queue.
 *  - to signal that a previously congested destination may have become
 *    uncongested
 *  - A notification has been queued to the socket (this can be a congestion
 *    update, or a RDMA completion, or a MSG_ZEROCOPY completion).
 *
 * EPOLLOUT is asserted if there is room on the send queue.  This does not
 * mean, however, that the next sendmsg() call will succeed.  If the
 * application tries to send to a congested destination, the system call may
 * still fail (and return ENOBUFS).
 */
static __poll_t rds_poll(struct file *file, struct socket *sock,
			 poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	__poll_t mask = 0;
	unsigned long flags;

	poll_wait(file, sk_sleep(sk), wait);

	if (rs->rs_seen_congestion)
		poll_wait(file, &rds_poll_waitq, wait);

	read_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!rs->rs_cong_monitor) {
		/* When a congestion map was updated, we signal EPOLLIN for
		 * "historical" reasons.  Applications can also poll for
		 * WRBAND instead. */
		if (rds_cong_updated_since(&rs->rs_cong_track))
			mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND);
	} else {
		spin_lock(&rs->rs_lock);
		if (rs->rs_cong_notify)
			mask |= (EPOLLIN | EPOLLRDNORM);
		spin_unlock(&rs->rs_lock);
	}
	if (!list_empty(&rs->rs_recv_queue) ||
	    !list_empty(&rs->rs_notify_queue) ||
	    !list_empty(&rs->rs_zcookie_queue.zcookie_head))
		mask |= (EPOLLIN | EPOLLRDNORM);
	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
		mask |= (EPOLLOUT | EPOLLWRNORM);
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR;
	read_unlock_irqrestore(&rs->rs_recv_lock, flags);

	/* clear state any time we wake a seen-congested socket */
	if (mask)
		rs->rs_seen_congestion = 0;

	return mask;
}
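
/*
 * Usage sketch (userspace, not part of this file): because EPOLLIN can mean
 * "congestion state changed" as well as "data ready", a reader must be
 * prepared for a receive to find nothing after poll() reported readability.
 * A minimal sketch under that assumption:
 *
 *	char buf[1024];
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
 *		ssize_t n = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
 *
 *		if (n < 0 && errno == EAGAIN)
 *			;	// a congestion update, not data - poll again
 *	}
 */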

static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
	rds_tos_t utos, tos = 0;

	switch (cmd) {
	case SIOCRDSSETTOS:
		if (get_user(utos, (rds_tos_t __user *)arg))
			return -EFAULT;

		if (rs->rs_transport &&
		    rs->rs_transport->get_tos_map)
			tos = rs->rs_transport->get_tos_map(utos);
		else
			return -ENOIOCTLCMD;

		spin_lock_bh(&rds_sock_lock);
		if (rs->rs_tos || rs->rs_conn) {
			spin_unlock_bh(&rds_sock_lock);
			return -EINVAL;
		}
		rs->rs_tos = tos;
		spin_unlock_bh(&rds_sock_lock);
		break;
	case SIOCRDSGETTOS:
		spin_lock_bh(&rds_sock_lock);
		tos = rs->rs_tos;
		spin_unlock_bh(&rds_sock_lock);
		if (put_user(tos, (rds_tos_t __user *)arg))
			return -EFAULT;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return 0;
}

static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len)
{
	struct sockaddr_in6 sin6;
	struct sockaddr_in sin;
	int ret = 0;

	/* racing with another thread binding seems ok here */
	if (ipv6_addr_any(&rs->rs_bound_addr)) {
		ret = -ENOTCONN; /* XXX not a great errno */
		goto out;
	}

	if (len < sizeof(struct sockaddr_in)) {
		ret = -EINVAL;
		goto out;
	} else if (len < sizeof(struct sockaddr_in6)) {
		/* Assume IPv4 */
		if (copy_from_sockptr(&sin, optval,
				      sizeof(struct sockaddr_in))) {
			ret = -EFAULT;
			goto out;
		}
		ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
		sin6.sin6_port = sin.sin_port;
	} else {
		if (copy_from_sockptr(&sin6, optval,
				      sizeof(struct sockaddr_in6))) {
			ret = -EFAULT;
			goto out;
		}
	}

	rds_send_drop_to(rs, &sin6);
out:
	return ret;
}

static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval,
			       int optlen)
{
	int value;

	if (optlen < sizeof(int))
		return -EINVAL;
	if (copy_from_sockptr(&value, optval, sizeof(int)))
		return -EFAULT;
	*optvar = !!value;
	return 0;
}

static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen)
{
	int ret;

	ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
	if (ret == 0) {
		if (rs->rs_cong_monitor) {
			rds_cong_add_socket(rs);
		} else {
			rds_cong_remove_socket(rs);
			rs->rs_cong_mask = 0;
			rs->rs_cong_notify = 0;
		}
	}
	return ret;
}

static int rds_set_transport(struct rds_sock *rs, sockptr_t optval, int optlen)
{
	int t_type;

	if (rs->rs_transport)
		return -EOPNOTSUPP; /* previously attached to transport */

	if (optlen != sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&t_type, optval, sizeof(t_type)))
		return -EFAULT;

	if (t_type < 0 || t_type >= RDS_TRANS_COUNT)
		return -EINVAL;

	rs->rs_transport = rds_trans_get(t_type);

	return rs->rs_transport ? 0 : -ENOPROTOOPT;
}

static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval,
				 int optlen, int optname)
{
	int val, valbool;

	if (optlen != sizeof(int))
		return -EFAULT;

	if (copy_from_sockptr(&val, optval, sizeof(int)))
		return -EFAULT;

	valbool = val ? 1 : 0;

	if (optname == SO_TIMESTAMP_NEW)
		sock_set_flag(sk, SOCK_TSTAMP_NEW);

	if (valbool)
		sock_set_flag(sk, SOCK_RCVTSTAMP);
	else
		sock_reset_flag(sk, SOCK_RCVTSTAMP);

	return 0;
}

static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval,
				  int optlen)
{
	struct rds_rx_trace_so trace;
	int i;

	if (optlen != sizeof(struct rds_rx_trace_so))
		return -EFAULT;

	if (copy_from_sockptr(&trace, optval, sizeof(trace)))
		return -EFAULT;

	if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
		return -EFAULT;

	rs->rs_rx_traces = trace.rx_traces;
	for (i = 0; i < rs->rs_rx_traces; i++) {
		if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
			rs->rs_rx_traces = 0;
			return -EFAULT;
		}
		rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
	}

	return 0;
}

static int rds_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
	int ret;

	if (level != SOL_RDS) {
		ret = -ENOPROTOOPT;
		goto out;
	}

	switch (optname) {
	case RDS_CANCEL_SENT_TO:
		ret = rds_cancel_sent_to(rs, optval, optlen);
		break;
	case RDS_GET_MR:
		ret = rds_get_mr(rs, optval, optlen);
		break;
	case RDS_GET_MR_FOR_DEST:
		ret = rds_get_mr_for_dest(rs, optval, optlen);
		break;
	case RDS_FREE_MR:
		ret = rds_free_mr(rs, optval, optlen);
		break;
	case RDS_RECVERR:
		ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
		break;
	case RDS_CONG_MONITOR:
		ret = rds_cong_monitor(rs, optval, optlen);
		break;
	case SO_RDS_TRANSPORT:
		lock_sock(sock->sk);
		ret = rds_set_transport(rs, optval, optlen);
		release_sock(sock->sk);
		break;
	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
		lock_sock(sock->sk);
		ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname);
		release_sock(sock->sk);
		break;
	case SO_RDS_MSG_RXPATH_LATENCY:
		ret = rds_recv_track_latency(rs, optval, optlen);
		break;
	default:
		ret = -ENOPROTOOPT;
	}
out:
	return ret;
}
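
/*
 * Usage sketch (userspace, not part of this file): all of the options above
 * live at SOL_RDS, with the constants coming from <linux/rds.h>.  A minimal
 * sketch pinning the transport and then cancelling pending sends to one
 * peer; note SO_RDS_TRANSPORT is accepted only once per socket, before a
 * transport has been attached, and RDS_CANCEL_SENT_TO needs a bound socket.
 * The peer address is purely illustrative:
 *
 *	#include <linux/rds.h>
 *
 *	int t = RDS_TRANS_TCP;
 *	struct sockaddr_in peer = {
 *		.sin_family = AF_INET,
 *		.sin_addr.s_addr = inet_addr("192.0.2.1"),	// example
 *		.sin_port = htons(18635),			// hypothetical
 *	};
 *
 *	setsockopt(fd, SOL_RDS, SO_RDS_TRANSPORT, &t, sizeof(t));
 *	setsockopt(fd, SOL_RDS, RDS_CANCEL_SENT_TO, &peer, sizeof(peer));
 */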

static int rds_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
	int ret = -ENOPROTOOPT, len;
	int trans;

	if (level != SOL_RDS)
		goto out;

	if (get_user(len, optlen)) {
		ret = -EFAULT;
		goto out;
	}

	switch (optname) {
	case RDS_INFO_FIRST ... RDS_INFO_LAST:
		ret = rds_info_getsockopt(sock, optname, optval,
					  optlen);
		break;

	case RDS_RECVERR:
		if (len < sizeof(int))
			ret = -EINVAL;
		else if (put_user(rs->rs_recverr, (int __user *)optval) ||
			 put_user(sizeof(int), optlen))
			ret = -EFAULT;
		else
			ret = 0;
		break;
	case SO_RDS_TRANSPORT:
		if (len < sizeof(int)) {
			ret = -EINVAL;
			break;
		}
		trans = (rs->rs_transport ? rs->rs_transport->t_type :
			 RDS_TRANS_NONE); /* unbound */
		if (put_user(trans, (int __user *)optval) ||
		    put_user(sizeof(int), optlen))
			ret = -EFAULT;
		else
			ret = 0;
		break;
	default:
		break;
	}

out:
	return ret;
}

static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
		       int addr_len, int flags)
{
	struct sock *sk = sock->sk;
	struct sockaddr_in *sin;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	int ret = 0;

	if (addr_len < offsetofend(struct sockaddr, sa_family))
		return -EINVAL;

	lock_sock(sk);

	switch (uaddr->sa_family) {
	case AF_INET:
		sin = (struct sockaddr_in *)uaddr;
		if (addr_len < sizeof(struct sockaddr_in)) {
			ret = -EINVAL;
			break;
		}
		if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
			ret = -EDESTADDRREQ;
			break;
		}
		if (ipv4_is_multicast(sin->sin_addr.s_addr) ||
		    sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
			ret = -EINVAL;
			break;
		}
		ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
		rs->rs_conn_port = sin->sin_port;
		break;

#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6: {
		struct sockaddr_in6 *sin6;
		int addr_type;

		sin6 = (struct sockaddr_in6 *)uaddr;
		if (addr_len < sizeof(struct sockaddr_in6)) {
			ret = -EINVAL;
			break;
		}
		addr_type = ipv6_addr_type(&sin6->sin6_addr);
		if (!(addr_type & IPV6_ADDR_UNICAST)) {
			__be32 addr4;

			if (!(addr_type & IPV6_ADDR_MAPPED)) {
				ret = -EPROTOTYPE;
				break;
			}

			/* It is a mapped address.  Need to do some sanity
			 * checks.
			 */
			addr4 = sin6->sin6_addr.s6_addr32[3];
			if (addr4 == htonl(INADDR_ANY) ||
			    addr4 == htonl(INADDR_BROADCAST) ||
			    ipv4_is_multicast(addr4)) {
				ret = -EPROTOTYPE;
				break;
			}
		}

		if (addr_type & IPV6_ADDR_LINKLOCAL) {
			/* If socket is already bound to a link local address,
			 * the peer address must be on the same link.
			 */
			if (sin6->sin6_scope_id == 0 ||
			    (!ipv6_addr_any(&rs->rs_bound_addr) &&
			     rs->rs_bound_scope_id &&
			     sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
				ret = -EINVAL;
				break;
			}
			/* Remember the connected address scope ID.  It will
			 * be checked against the binding local address when
			 * the socket is bound.
			 */
			rs->rs_bound_scope_id = sin6->sin6_scope_id;
		}
		rs->rs_conn_addr = sin6->sin6_addr;
		rs->rs_conn_port = sin6->sin6_port;
		break;
	}
#endif

	default:
		ret = -EAFNOSUPPORT;
		break;
	}

	release_sock(sk);
	return ret;
}
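
/*
 * Usage sketch (userspace, not part of this file): as the code above shows,
 * connect() on an RDS socket only records the default destination for
 * subsequent send()/recv(); no packets are exchanged here, and the
 * underlying transport connection is set up lazily on first transmit.
 * A minimal sketch with an illustrative IPv4 peer:
 *
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_addr.s_addr = inet_addr("192.0.2.1"),	// example
 *		.sin_port = htons(18635),			// hypothetical
 *	};
 *
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *	send(fd, msg, msg_len, 0);	// delivered to dst
 */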
614 */ 615 rs->rs_bound_scope_id = sin6->sin6_scope_id; 616 } 617 rs->rs_conn_addr = sin6->sin6_addr; 618 rs->rs_conn_port = sin6->sin6_port; 619 break; 620 } 621#endif 622 623 default: 624 ret = -EAFNOSUPPORT; 625 break; 626 } 627 628 release_sock(sk); 629 return ret; 630} 631 632static struct proto rds_proto = { 633 .name = "RDS", 634 .owner = THIS_MODULE, 635 .obj_size = sizeof(struct rds_sock), 636}; 637 638static const struct proto_ops rds_proto_ops = { 639 .family = AF_RDS, 640 .owner = THIS_MODULE, 641 .release = rds_release, 642 .bind = rds_bind, 643 .connect = rds_connect, 644 .socketpair = sock_no_socketpair, 645 .accept = sock_no_accept, 646 .getname = rds_getname, 647 .poll = rds_poll, 648 .ioctl = rds_ioctl, 649 .listen = sock_no_listen, 650 .shutdown = sock_no_shutdown, 651 .setsockopt = rds_setsockopt, 652 .getsockopt = rds_getsockopt, 653 .sendmsg = rds_sendmsg, 654 .recvmsg = rds_recvmsg, 655 .mmap = sock_no_mmap, 656 .sendpage = sock_no_sendpage, 657}; 658 659static void rds_sock_destruct(struct sock *sk) 660{ 661 struct rds_sock *rs = rds_sk_to_rs(sk); 662 663 WARN_ON((&rs->rs_item != rs->rs_item.next || 664 &rs->rs_item != rs->rs_item.prev)); 665} 666 667static int __rds_create(struct socket *sock, struct sock *sk, int protocol) 668{ 669 struct rds_sock *rs; 670 671 sock_init_data(sock, sk); 672 sock->ops = &rds_proto_ops; 673 sk->sk_protocol = protocol; 674 sk->sk_destruct = rds_sock_destruct; 675 676 rs = rds_sk_to_rs(sk); 677 spin_lock_init(&rs->rs_lock); 678 rwlock_init(&rs->rs_recv_lock); 679 INIT_LIST_HEAD(&rs->rs_send_queue); 680 INIT_LIST_HEAD(&rs->rs_recv_queue); 681 INIT_LIST_HEAD(&rs->rs_notify_queue); 682 INIT_LIST_HEAD(&rs->rs_cong_list); 683 rds_message_zcopy_queue_init(&rs->rs_zcookie_queue); 684 spin_lock_init(&rs->rs_rdma_lock); 685 rs->rs_rdma_keys = RB_ROOT; 686 rs->rs_rx_traces = 0; 687 rs->rs_tos = 0; 688 rs->rs_conn = NULL; 689 690 spin_lock_bh(&rds_sock_lock); 691 list_add_tail(&rs->rs_item, &rds_sock_list); 692 rds_sock_count++; 693 spin_unlock_bh(&rds_sock_lock); 694 695 return 0; 696} 697 698static int rds_create(struct net *net, struct socket *sock, int protocol, 699 int kern) 700{ 701 struct sock *sk; 702 703 if (sock->type != SOCK_SEQPACKET || protocol) 704 return -ESOCKTNOSUPPORT; 705 706 sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern); 707 if (!sk) 708 return -ENOMEM; 709 710 return __rds_create(sock, sk, protocol); 711} 712 713void rds_sock_addref(struct rds_sock *rs) 714{ 715 sock_hold(rds_rs_to_sk(rs)); 716} 717 718void rds_sock_put(struct rds_sock *rs) 719{ 720 sock_put(rds_rs_to_sk(rs)); 721} 722 723static const struct net_proto_family rds_family_ops = { 724 .family = AF_RDS, 725 .create = rds_create, 726 .owner = THIS_MODULE, 727}; 728 729static void rds_sock_inc_info(struct socket *sock, unsigned int len, 730 struct rds_info_iterator *iter, 731 struct rds_info_lengths *lens) 732{ 733 struct rds_sock *rs; 734 struct rds_incoming *inc; 735 unsigned int total = 0; 736 737 len /= sizeof(struct rds_info_message); 738 739 spin_lock_bh(&rds_sock_lock); 740 741 list_for_each_entry(rs, &rds_sock_list, rs_item) { 742 /* This option only supports IPv4 sockets. */ 743 if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) 744 continue; 745 746 read_lock(&rs->rs_recv_lock); 747 748 /* XXX too lazy to maintain counts.. 

static void rds_sock_inc_info(struct socket *sock, unsigned int len,
			      struct rds_info_iterator *iter,
			      struct rds_info_lengths *lens)
{
	struct rds_sock *rs;
	struct rds_incoming *inc;
	unsigned int total = 0;

	len /= sizeof(struct rds_info_message);

	spin_lock_bh(&rds_sock_lock);

	list_for_each_entry(rs, &rds_sock_list, rs_item) {
		/* This option only supports IPv4 sockets. */
		if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
			continue;

		read_lock(&rs->rs_recv_lock);

		/* XXX too lazy to maintain counts.. */
		list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
			total++;
			if (total <= len)
				rds_inc_info_copy(inc, iter,
						  inc->i_saddr.s6_addr32[3],
						  rs->rs_bound_addr_v4,
						  1);
		}

		read_unlock(&rs->rs_recv_lock);
	}

	spin_unlock_bh(&rds_sock_lock);

	lens->nr = total;
	lens->each = sizeof(struct rds_info_message);
}

#if IS_ENABLED(CONFIG_IPV6)
static void rds6_sock_inc_info(struct socket *sock, unsigned int len,
			       struct rds_info_iterator *iter,
			       struct rds_info_lengths *lens)
{
	struct rds_incoming *inc;
	unsigned int total = 0;
	struct rds_sock *rs;

	len /= sizeof(struct rds6_info_message);

	spin_lock_bh(&rds_sock_lock);

	list_for_each_entry(rs, &rds_sock_list, rs_item) {
		read_lock(&rs->rs_recv_lock);

		list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
			total++;
			if (total <= len)
				rds6_inc_info_copy(inc, iter, &inc->i_saddr,
						   &rs->rs_bound_addr, 1);
		}

		read_unlock(&rs->rs_recv_lock);
	}

	spin_unlock_bh(&rds_sock_lock);

	lens->nr = total;
	lens->each = sizeof(struct rds6_info_message);
}
#endif

static void rds_sock_info(struct socket *sock, unsigned int len,
			  struct rds_info_iterator *iter,
			  struct rds_info_lengths *lens)
{
	struct rds_info_socket sinfo;
	unsigned int cnt = 0;
	struct rds_sock *rs;

	len /= sizeof(struct rds_info_socket);

	spin_lock_bh(&rds_sock_lock);

	if (len < rds_sock_count) {
		cnt = rds_sock_count;
		goto out;
	}

	list_for_each_entry(rs, &rds_sock_list, rs_item) {
		/* This option only supports IPv4 sockets. */
		if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
			continue;
		sinfo.sndbuf = rds_sk_sndbuf(rs);
		sinfo.rcvbuf = rds_sk_rcvbuf(rs);
		sinfo.bound_addr = rs->rs_bound_addr_v4;
		sinfo.connected_addr = rs->rs_conn_addr_v4;
		sinfo.bound_port = rs->rs_bound_port;
		sinfo.connected_port = rs->rs_conn_port;
		sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));

		rds_info_copy(iter, &sinfo, sizeof(sinfo));
		cnt++;
	}

out:
	lens->nr = cnt;
	lens->each = sizeof(struct rds_info_socket);

	spin_unlock_bh(&rds_sock_lock);
}

#if IS_ENABLED(CONFIG_IPV6)
static void rds6_sock_info(struct socket *sock, unsigned int len,
			   struct rds_info_iterator *iter,
			   struct rds_info_lengths *lens)
{
	struct rds6_info_socket sinfo6;
	struct rds_sock *rs;

	len /= sizeof(struct rds6_info_socket);

	spin_lock_bh(&rds_sock_lock);

	if (len < rds_sock_count)
		goto out;

	list_for_each_entry(rs, &rds_sock_list, rs_item) {
		sinfo6.sndbuf = rds_sk_sndbuf(rs);
		sinfo6.rcvbuf = rds_sk_rcvbuf(rs);
		sinfo6.bound_addr = rs->rs_bound_addr;
		sinfo6.connected_addr = rs->rs_conn_addr;
		sinfo6.bound_port = rs->rs_bound_port;
		sinfo6.connected_port = rs->rs_conn_port;
		sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs));

		rds_info_copy(iter, &sinfo6, sizeof(sinfo6));
	}

out:
	lens->nr = rds_sock_count;
	lens->each = sizeof(struct rds6_info_socket);

	spin_unlock_bh(&rds_sock_lock);
}
#endif
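
/*
 * Usage sketch (userspace, not part of this file): the info handlers above
 * back the RDS_INFO_* getsockopt range that rds-info(1) consumes.  A minimal
 * sketch, assuming the rds-info convention that a too-small buffer fails
 * with ENOSPC and leaves the required byte count in optlen (realloc failure
 * handling omitted for brevity):
 *
 *	#include <linux/rds.h>
 *	#include <stdlib.h>
 *
 *	struct rds_info_socket *socks = NULL;
 *	socklen_t olen = 0;
 *
 *	while (getsockopt(fd, SOL_RDS, RDS_INFO_SOCKETS,
 *			  socks, &olen) < 0 && errno == ENOSPC)
 *		socks = realloc(socks, olen);	// retry with a bigger buffer
 *	// on success, olen / sizeof(*socks) entries describe open sockets
 */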

static void rds_exit(void)
{
	sock_unregister(rds_family_ops.family);
	proto_unregister(&rds_proto);
	rds_conn_exit();
	rds_cong_exit();
	rds_sysctl_exit();
	rds_threads_exit();
	rds_stats_exit();
	rds_page_exit();
	rds_bind_lock_destroy();
	rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
	rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
#if IS_ENABLED(CONFIG_IPV6)
	rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info);
	rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
#endif
}
module_exit(rds_exit);

u32 rds_gen_num;

static int rds_init(void)
{
	int ret;

	net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));

	ret = rds_bind_lock_init();
	if (ret)
		goto out;

	ret = rds_conn_init();
	if (ret)
		goto out_bind;

	ret = rds_threads_init();
	if (ret)
		goto out_conn;
	ret = rds_sysctl_init();
	if (ret)
		goto out_threads;
	ret = rds_stats_init();
	if (ret)
		goto out_sysctl;
	ret = proto_register(&rds_proto, 1);
	if (ret)
		goto out_stats;
	ret = sock_register(&rds_family_ops);
	if (ret)
		goto out_proto;

	rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
	rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
#if IS_ENABLED(CONFIG_IPV6)
	rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info);
	rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
#endif

	goto out;

out_proto:
	proto_unregister(&rds_proto);
out_stats:
	rds_stats_exit();
out_sysctl:
	rds_sysctl_exit();
out_threads:
	rds_threads_exit();
out_conn:
	rds_conn_exit();
	rds_cong_exit();
	rds_page_exit();
out_bind:
	rds_bind_lock_destroy();
out:
	return ret;
}
module_init(rds_init);

#define DRV_VERSION	"4.0"
#define DRV_RELDATE	"Feb 12, 2009"

MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
		   " v" DRV_VERSION " (" DRV_RELDATE ")");
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NETPROTO(PF_RDS);