读Linux内核(4.9.9)之bind系统调用 Linux内核阅读

在调用socket成功返回后，我们得到与socket关联的文件描述符。然后我们以该描述符和sockaddr地址结构对象为参数调用bind，就实现了socket对象地址的绑定。那这个绑定到底是什么意思？这个绑定操作是必须的吗？绑定操作之后，socket对象又发生了什么？也许还有更多的疑问，我们到协议栈的源码中寻找答案。先贴上bind系统调用的源码：

/*
 *	Bind a name to a socket. Nothing much to do here since it's
 *	the protocol's responsibility to handle the local address.
 *
 *	We move the socket address to kernel space before we call
 *	the protocol layer (having also checked the address is ok).
 *
 *	Parameters: fd is the socket's file descriptor, umyaddr the
 *	user-space address to bind, addrlen its length in bytes.
 *	Returns err: the value produced by the lookup, the copy-in, the
 *	security hook, or finally the protocol's bind operation.
 */
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err, fput_needed;

	/*
	 * Resolve the descriptor to its socket object. The helper is not
	 * shown here; the article states it goes fd -> file ->
	 * file->private_data. On failure sock is NULL and err (passed by
	 * address) is what gets returned.
	 */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock) {
		/* Copy the address from user space into the kernel buffer. */
		err = move_addr_to_kernel(umyaddr, addrlen, &address);
		if (err >= 0) {
			/* Security hook; the bind proceeds only if it returns 0. */
			err = security_socket_bind(sock,
						   (struct sockaddr *)&address,
						   addrlen);
			/*
			 * sock->ops is assigned in inet_create() from the
			 * matching inetsw_array entry:
			 *   stream sockets   -> inet_stream_ops
			 *   datagram sockets -> inet_dgram_ops
			 * In both tables .bind is inet_bind. For TCP the net
			 * effect, per the article, is that the sock ends up in
			 * the bhash table of tcp_hashinfo.
			 */
			if (!err)
				err = sock->ops->bind(sock,
						      (struct sockaddr *)
						      &address, addrlen);
		}
		/*
		 * Presumably drops the file reference taken by the lookup,
		 * as indicated by fput_needed (helper not shown here).
		 */
		fput_light(sock->file, fput_needed);
	}
	return err;
}

通过传入的文件描述符，我们找到对应的file对象，然后通过file对象的private_data指针找到socket对象。接着将包含地址信息的sockaddr对象从用户空间拷贝到内核空间，接着是安全方面的东西，接着就是我们关注的重点：

if (!err) err = sock->ops->bind(sock, (struct sockaddr *) &address, addrlen);

我们在执行socket系统调用时，会传入套接字的类型（流式套接字SOCK_STREAM，数据报套接字SOCK_DGRAM，原生套接字SOCK_RAW），根据该类型，我们会将相应类型的操作函数集指针，以及协议描述块指针赋值给socket对象，如对于流式套接字，操作函数集为inet_stream_ops，协议描述块为tcp_prot：

static int inet_create(struct net *net, struct socket *sock, int protocol, int kern) { ... //根据套接字类型sock->type得到协议的关联对象 //套接字类型与协议的关联对象数组定义在inetsw_array,inet_init函数中会加载到inetsw拉链表 //其实如果只有inet协议族,则这个链表只有一个对象 list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {err = 0; /* Check the non-wild match. */ if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) break; } else { /* Check for the two wild cases. */ if (IPPROTO_IP == protocol) { protocol = answer->protocol; break; } if (IPPROTO_IP == answer->protocol) break; } err = -EPROTONOSUPPORT; }...//inetsw_array//特定协议的操作函数集，声明在net.h //对于tcp: inet_stream_ops sock->ops = answer->ops; //INET层协议描述块 //对于tcp, tcp_prot answer_prot = answer->prot; answer_flags = answer->flags; ...}

参考 http://blog.csdn.net/idwtwt/article/details/50964302 ，我们知道inetsw源自于inetsw_array：

/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 *
 * Each entry ties a socket type and protocol number to the protocol
 * descriptor (.prot) and the socket-level operation table (.ops) that
 * inet_create() copies into a newly created socket.
 */
static struct inet_protosw inetsw_array[] =
{
	/* SOCK_STREAM / TCP: tcp_prot with inet_stream_ops. */
	{
		.type =       SOCK_STREAM,
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
	},

	/* SOCK_DGRAM / UDP: udp_prot with inet_dgram_ops. */
	{
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_UDP,
		.prot =       &udp_prot,
		.ops =        &inet_dgram_ops,
		.flags =      INET_PROTOSW_PERMANENT,
	},

	/* SOCK_DGRAM / ICMP: ping_prot, sharing inet_dgram_ops with UDP. */
	{
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_ICMP,
		.prot =       &ping_prot,
		.ops =        &inet_dgram_ops,
		.flags =      INET_PROTOSW_REUSE,
	},

	/* SOCK_RAW: raw_prot with inet_sockraw_ops; matches any protocol. */
	{
		.type =       SOCK_RAW,
		.protocol =   IPPROTO_IP,	/* wild card */
		.prot =       &raw_prot,
		.ops =        &inet_sockraw_ops,
		.flags =      INET_PROTOSW_REUSE,
	}
};

结合起来看：
1 对于流式套接字ops字段将被赋值inet_stream_ops指针
2 对于数据报套接字ops字段将被赋值inet_dgram_ops指针
3 对于原生套接字ops字段将被赋值inet_sockraw_ops指针
操作函数集详细定义为：

/*
 * Socket-level operations for SOCK_STREAM (TCP) sockets.
 * Note .bind is inet_bind, the same entry point used by the datagram
 * and raw tables below.
 */
const struct proto_ops inet_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = inet_bind,
	.connect	   = inet_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = inet_accept,
	.getname	   = inet_getname,
	.poll		   = tcp_poll,
	.ioctl		   = inet_ioctl,
	.listen		   = inet_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
	.splice_read	   = tcp_splice_read,
	.read_sock	   = tcp_read_sock,
	.peek_len	   = tcp_peek_len,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
	.compat_ioctl	   = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);

/*
 * Socket-level operations for SOCK_DGRAM sockets (referenced by both
 * the UDP and the ICMP entries of inetsw_array). accept and listen are
 * wired to the sock_no_* stubs.
 */
const struct proto_ops inet_dgram_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = inet_bind,
	.connect	   = inet_dgram_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = sock_no_accept,
	.getname	   = inet_getname,
	.poll		   = udp_poll,
	.ioctl		   = inet_ioctl,
	.listen		   = sock_no_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
	.set_peek_off	   = sk_set_peek_off,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
	.compat_ioctl	   = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);

/*
 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
 * udp_poll
 *
 * (.poll is datagram_poll here, and there is no .set_peek_off entry.)
 */
static const struct proto_ops inet_sockraw_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = inet_bind,
	.connect	   = inet_dgram_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = sock_no_accept,
	.getname	   = inet_getname,
	.poll		   = datagram_poll,
	.ioctl		   = inet_ioctl,
	.listen		   = sock_no_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
	.compat_ioctl	   = inet_compat_ioctl,
#endif
};

可知三种类型的套接字，sock->ops->bind实际调用的都是inet_bind，我们分析下该函数：

/*
 * inet_bind - proto_ops .bind handler shared by the AF_INET stream,
 * datagram and raw operation tables.
 *
 * @sock:     socket being bound
 * @uaddr:    address already copied into kernel space; read here as a
 *            struct sockaddr_in
 * @addr_len: length of @uaddr in bytes
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL        address too short, socket not in TCP_CLOSE, or
 *                  already bound to a port
 *   -EAFNOSUPPORT  family is neither AF_INET nor the tolerated
 *                  AF_UNSPEC + INADDR_ANY combination
 *   -EADDRNOTAVAIL address is not local/multicast/broadcast and
 *                  non-local binding is not enabled
 *   -EACCES        port below PROT_SOCK without CAP_NET_BIND_SERVICE
 *   -EADDRINUSE    the protocol's get_port() refused the port
 * If the protocol supplies its own bind, that function's result is
 * returned unchanged.
 */
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	unsigned short snum;
	int chk_addr_ret;
	u32 tb_id = RT_TABLE_LOCAL;
	int err;

	/* If the socket has its own bind function then use it. (RAW) */
	/*
	 * tcp_prot and udp_prot as quoted in this article define no .bind,
	 * so TCP and UDP sockets skip this branch and continue below.
	 */
	if (sk->sk_prot->bind) {
		err = sk->sk_prot->bind(sk, uaddr, addr_len);
		goto out;
	}
	err = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	if (addr->sin_family != AF_INET) {
		/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
		 * only if s_addr is INADDR_ANY.
		 */
		err = -EAFNOSUPPORT;
		if (addr->sin_family != AF_UNSPEC ||
		    addr->sin_addr.s_addr != htonl(INADDR_ANY))
			goto out;
	}

	/* Use the l3mdev table for the bound device if there is one,
	 * otherwise keep the default RT_TABLE_LOCAL.
	 */
	tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
	chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);

	/* Not specified by any standard per-se, however it breaks too
	 * many applications when removed.  It is unfortunate since
	 * allowing applications to make a non-local bind solves
	 * several problems with systems using dynamic addressing.
	 * (ie. your servers still start up even if your ISDN link
	 *  is temporarily down)
	 */
	err = -EADDRNOTAVAIL;
	if (!net->ipv4.sysctl_ip_nonlocal_bind &&
	    !(inet->freebind || inet->transparent) &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
	    chk_addr_ret != RTN_LOCAL &&
	    chk_addr_ret != RTN_MULTICAST &&
	    chk_addr_ret != RTN_BROADCAST)
		goto out;

	/* Port in host byte order; 0 asks for any free port. */
	snum = ntohs(addr->sin_port);
	err = -EACCES;
	if (snum && snum < PROT_SOCK &&
	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
		goto out;

	/*      We keep a pair of addresses. rcv_saddr is the one
	 *      used by hash lookups, and saddr is used for transmit.
	 *
	 *      In the BSD API these are the same except where it
	 *      would be illegal to use them (multicast/broadcast) in
	 *      which case the sending device address is used.
	 */
	lock_sock(sk);

	/* Check these errors (active socket, double bind). */
	/*
	 * Fail if the socket has left its initial TCP_CLOSE state or
	 * already has a port: a socket binds at most one port, although
	 * one port may be shared by several sockets.
	 */
	err = -EINVAL;
	if (sk->sk_state != TCP_CLOSE || inet->inet_num)
		goto out_release_sock;

	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
		inet->inet_saddr = 0;  /* Use device */

	/* Make sure we are allowed to bind here. */
	/*
	 * get_port is inet_csk_get_port for tcp_prot and udp_v4_get_port
	 * for udp_prot; a non-zero return means the port is unavailable.
	 * For TCP, inet_csk_get_port finds or creates the
	 * inet_bind_bucket for the port in tcp_hashinfo's bhash and calls
	 * inet_bind_hash(); per the article that call records the port
	 * in sk and adds sk to the bucket's owners list (helper body not
	 * shown here). When no port is given it tries odd ports first.
	 */
	if ((snum || !inet->bind_address_no_port) &&
	    sk->sk_prot->get_port(sk, snum)) {
		inet->inet_saddr = inet->inet_rcv_saddr = 0;
		err = -EADDRINUSE;
		goto out_release_sock;
	}

	/* inet_rcv_saddr is the bound address, used to look the socket up
	 * on receive.
	 */
	if (inet->inet_rcv_saddr)
		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
	if (snum)
		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
	inet->inet_sport = htons(inet->inet_num);
	inet->inet_daddr = 0;
	inet->inet_dport = 0;
	sk_dst_reset(sk);
	err = 0;
out_release_sock:
	release_sock(sk);
out:
	return err;
}

分析该函数，我们发现如果套接字对应的实现协议本身有bind函数，会执行协议的bind函数然后跳到函数结尾返回；如果对应的实现协议本身没有定义bind函数则往下执行。查看tcp和udp的协议描述块，我们没有发现有bind函数，所以对于tcp和udp来说if内的代码是没有执行的，而是往下执行。

/*
 * Protocol descriptor for TCP over IPv4.
 * Points relevant to bind(): there is no .bind member, .get_port is
 * inet_csk_get_port, and .h.hashinfo is tcp_hashinfo.
 * Article author's note: the slab cache for this protocol is created
 * when inet_init calls proto_register (not shown in this excerpt).
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};

/*
 * Protocol descriptor for UDP over IPv4.
 * Points relevant to bind(): there is no .bind member, .get_port is
 * udp_v4_get_port, and .h.udp_table is udp_table.
 * Article author's note: the slab cache for this protocol is created
 * when inet_init calls proto_register (not shown in this excerpt).
 */
struct proto udp_prot = {
	.name		   = "UDP",
	.owner		   = THIS_MODULE,
	.close		   = udp_lib_close,
	.connect	   = ip4_datagram_connect,
	.disconnect	   = udp_disconnect,
	.ioctl		   = udp_ioctl,
	.destroy	   = udp_destroy_sock,
	.setsockopt	   = udp_setsockopt,
	.getsockopt	   = udp_getsockopt,
	.sendmsg	   = udp_sendmsg,
	.recvmsg	   = udp_recvmsg,
	.sendpage	   = udp_sendpage,
	.backlog_rcv	   = __udp_queue_rcv_skb,
	.release_cb	   = ip4_datagram_release_cb,
	.hash		   = udp_lib_hash,
	.unhash		   = udp_lib_unhash,
	.rehash		   = udp_v4_rehash,
	.get_port	   = udp_v4_get_port,
	.memory_allocated  = &udp_memory_allocated,
	.sysctl_mem	   = sysctl_udp_mem,
	.sysctl_wmem	   = &sysctl_udp_wmem_min,
	.sysctl_rmem	   = &sysctl_udp_rmem_min,
	.obj_size	   = sizeof(struct udp_sock),
	.h.udp_table	   = &udp_table,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_udp_setsockopt,
	.compat_getsockopt = compat_udp_getsockopt,
#endif
	.diag_destroy	   = udp_abort,
};

只有对于原生类型的套接字才有自己的bind函数：

/* If the socket has its own bind function then use it. (RAW) */ //对于tcp, tcp_prot //对于tcp,是没有bind的,所以下面不会进入if//slab在inet_init中调用proto_register时建立 if (sk->sk_prot->bind) { err = sk->sk_prot->bind(sk, uaddr, addr_len); goto out; }

往下的代码主要是地址信息合法性检查，还有inet_sock对象的设置（对于udp，实际上是udp_sock对象中的inet_sock对象——查看udp_sock定义，可知udp_sock中包含一个inet_sock对象），申请端口等。留意端口申请代码：

if (sk->sk_prot->get_port(sk, snum)) { /* 对于udp：get_port即udp_v4_get_port --> udp_lib_get_port，会将sock对象加入哈希表udp_table（协议描述块来自inetsw_array） */ inet->saddr = inet->rcv_saddr = 0; err = -EADDRINUSE; goto out_release_sock; }

查看协议描述块可知，对于tcp，调用的是inet_csk_get_port，对于udp调用的是udp_v4_get_port，我们只分析inet_csk_get_port：

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 * We try to allocate an odd port (and leave even ports for connect())
 *
 * Returns 0 on success and 1 (the initial value of ret) on failure.
 * On success the sock is attached to the inet_bind_bucket for the
 * chosen port in the protocol's bind hash table (tcp_hashinfo for TCP).
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
	/* For TCP, sk->sk_prot is tcp_prot, so this is &tcp_hashinfo. */
	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
	int ret = 1, attempts = 5, port = snum;
	int smallest_size = -1, smallest_port;
	struct inet_bind_hashbucket *head;
	struct net *net = sock_net(sk);
	int i, low, high, attempt_half;
	struct inet_bind_bucket *tb;
	kuid_t uid = sock_i_uid(sk);
	u32 remaining, offset;

	if (port) {
have_port:
		/*
		 * A specific port is wanted: pick the hash chain for it.
		 * Article author's notes (helpers not shown here):
		 * tcp_hashinfo is set up in tcp_init, and inet_bhashfn
		 * reduces the port number by the table size. Each chain
		 * holds the bind buckets that hash to the same slot.
		 */
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		/* Walk the chain looking for this namespace's bucket for port. */
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port)
				goto tb_found;

		goto tb_not_found;
	}
again:
	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
	/* No port requested: choose one at random from the local range. */
	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	if (high - low < 4)
		attempt_half = 0;
	if (attempt_half) {
		/* Split point, kept at an even distance from low. */
		int half = low + (((high - low) >> 2) << 1);

		if (attempt_half == 1)
			high = half;
		else
			low = half;
	}
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	offset = prandom_u32() % remaining;
	/* __inet_hash_connect() favors ports having @low parity
	 * We do the opposite to not pollute connect() users.
	 */
	offset |= 1U;
	smallest_size = -1;
	smallest_port = low; /* avoid compiler warning */

other_parity_scan:
	port = low + offset;
	/* Step by 2 so every candidate in this pass has the same parity. */
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		/* Walk the chain from its head. */
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port) {
				/*
				 * Port already has a bucket. If it is
				 * shareable, remember the candidate with
				 * the fewest owners seen so far.
				 */
				if (((tb->fastreuse > 0 && reuse) ||
				     (tb->fastreuseport > 0 &&
				      sk->sk_reuseport &&
				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
				      uid_eq(tb->fastuid, uid))) &&
				    (tb->num_owners < smallest_size || smallest_size == -1)) {
					smallest_size = tb->num_owners; /* owner count */
					smallest_port = port;           /* and the port */
				}
				/*
				 * Ask the address family whether binding
				 * here would conflict. Article author's
				 * notes (not verifiable from this excerpt):
				 * for ordinary TCP the callback is
				 * inet_csk_bind_conflict, and a port may be
				 * shared when the sockets are bound to
				 * different interfaces, when all of them
				 * set address reuse and none is listening,
				 * or when they use different source
				 * addresses.
				 */
				if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
					goto tb_found;
				goto next_port;
			}
		/* No bucket for this port yet; head->lock is still held. */
		goto tb_not_found;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	if (smallest_size != -1) {
		/* Fall back to the least-shared reusable port found. */
		port = smallest_port;
		goto have_port;
	}
	offset--;
	if (!(offset & 1))
		goto other_parity_scan;

	if (attempt_half == 1) {
		/* OK we now try the upper half of the range */
		attempt_half = 2;
		goto other_half_scan;
	}
	return ret;

tb_not_found:
	/* No bucket for this port on the chain: create one and link it in. */
	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
				     net, head, port);
	if (!tb)
		goto fail_unlock;
tb_found:
	/* The bucket already has owners, i.e. the port is in use. */
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		if (((tb->fastreuse > 0 && reuse) ||
		     (tb->fastreuseport > 0 &&
		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
		    smallest_size == -1)
			goto success;
		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
			/*
			 * Conflict on a port picked by the scan: rescan,
			 * bounded by the attempts counter.
			 */
			if ((reuse ||
			     (tb->fastreuseport > 0 &&
			      sk->sk_reuseport &&
			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
			      uid_eq(tb->fastuid, uid))) &&
			    smallest_size != -1 && --attempts >= 0) {
				spin_unlock_bh(&head->lock);
				goto again;
			}
			goto fail_unlock;
		}
		/* A non-reusing owner joins: clear the fast-path flags. */
		if (!reuse)
			tb->fastreuse = 0;
		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
			tb->fastreuseport = 0;
	} else {
		/* First owner decides the bucket's fast-reuse flags. */
		tb->fastreuse = reuse;
		if (sk->sk_reuseport) {
			tb->fastreuseport = 1;
			tb->fastuid = uid;
		} else {
			tb->fastreuseport = 0;
		}
	}
success:
	/* Attach sk to the bucket unless it is already attached to one. */
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, port);
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;

fail_unlock:
	spin_unlock_bh(&head->lock);
	return ret;
}

该函数除了得到可用的端口外，主要作用是将sock对象加入&hinfo->bhash哈希表。我们知道哈希表可以加快搜索的速度，这主要用在数据接收过程中。在数据接收过程中，我们接收发往本机的数据报，依据的是目的IP。但是系统中有很多进程、很多socket连接，仅凭目的IP并不知道数据是要给哪个进程的。这时通过端口号和哈希表就可以快速定位接收数据的sock，然后将数据放到sock的接收队列中，等待用户线程取数据，这样就完成了一次网络通信。
最后，总的来说bind系统调用，本质上是根据端口号，将socket调用得到的socket对象加入bind哈希表。