读Linux内核(4.9.9)之bind系统调用
在调用socket成功返回后,我们得到与socket关联的文件描述符。然后我们以该描述符和sockaddr地址结构对象为参数调用bind,就实现了socket对象地址的绑定。那这个绑定到底是什么意思?这个绑定操作是必须的吗?绑定操作之后,socket对象又发生了什么?也许还有更多的疑问,我们到协议栈的源码中寻找答案。先贴上bind系统调用的源码:
/*
 *	Bind a name to a socket. Nothing much to do here since it's
 *	the protocol's responsibility to handle the local address.
 *
 *	We move the socket address to kernel space before we call
 *	the protocol layer (having also checked the address is ok).
 *
 *	@fd:      file descriptor of the socket
 *	@umyaddr: user-space pointer to the address to bind
 *	@addrlen: length in bytes of the address at @umyaddr
 *
 *	Returns err: the value stored through &err by sockfd_lookup_light()
 *	when no socket is found, otherwise the result of the first step
 *	that fails, or the result of the protocol's bind operation.
 */
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err, fput_needed;

	/*
	 * Get the file object for the descriptor, then the socket object
	 * from its private_data member.
	 */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock) {
		/* Copy the address information from user space into the kernel. */
		err = move_addr_to_kernel(umyaddr, addrlen, &address);
		if (err >= 0) {
			err = security_socket_bind(sock,
						   (struct sockaddr *)&address,
						   addrlen);
			/*
			 * sock->ops is initialised in inet_create() from the
			 * inetsw_array entry matching the socket type/protocol:
			 * inet_stream_ops for stream sockets, inet_dgram_ops
			 * for datagram sockets.  In both, ->bind is inet_bind.
			 * For TCP this essentially puts the sock into the bind
			 * hash table (bhash) of tcp_hashinfo.
			 *
			 * Only call into the protocol when the security hook
			 * accepted the request (err == 0).
			 */
			if (!err)
				err = sock->ops->bind(sock,
						      (struct sockaddr *)
						      &address, addrlen);
		}
		fput_light(sock->file, fput_needed);
	}
	return err;
}
通过传入的文件描述符,我们找到对应的file对象,然后通过file对象的private_data指针找到socket对象。接着将包含地址信息的sockaddr对象从用户空间拷贝到内核空间,然后调用security_socket_bind做安全方面的检查,最后就是我们关注的重点:
/* Excerpt from the bind syscall: dispatch to the socket's bind operation only when err is 0. */
if (!err)
err = sock->ops->bind(sock,
(struct sockaddr *)
&address, addrlen);
我们在执行socket系统调用时,会传入套接字的类型(流式套接字SOCK_STREAM,数据报套接字SOCK_DGRAM,原生套接字SOCK_RAW)。根据该类型,内核会将相应的操作函数集指针以及协议描述块指针赋值给socket对象。例如对于流式套接字,操作函数集为inet_stream_ops,协议描述块为tcp_prot:
/*
 * Excerpt of inet_create() (elided parts are marked "..."): walk the
 * inetsw list for sock->type, pick the entry matching the requested
 * protocol, and take its ops, prot and flags.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
...
/* Find the type/protocol association object for the socket type sock->type. */
/* The association objects are defined in inetsw_array; inet_init loads them into the inetsw chained lists. */
/* Author's note: with only the inet family, each list holds a single object. */
/* NOTE(review): inetsw_array has two SOCK_DGRAM entries (UDP and ICMP), so this may not hold for every type - confirm. */
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
err = -EPROTONOSUPPORT;
}...// Taken from the matched inetsw_array entry: the protocol-specific operation set (declared in net.h, per the author)
/* For TCP: inet_stream_ops */
sock->ops = answer->ops;
/* INET-layer protocol descriptor */
/* For TCP: tcp_prot */
answer_prot = answer->prot;
answer_flags = answer->flags;
...}
参考 http://blog.csdn.net/idwtwt/article/details/50964302 ,我们知道inetsw源自于inetsw_array:
/*
 * Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 *
 * Each entry associates a socket type and protocol number with the
 * protocol descriptor (.prot) and the socket operation set (.ops)
 * used for sockets of that kind.
 */
static struct inet_protosw inetsw_array[] =
{
	/* Stream sockets: TCP */
	{
		.type     = SOCK_STREAM,
		.protocol = IPPROTO_TCP,
		.prot     = &tcp_prot,
		.ops      = &inet_stream_ops,
		.flags    = INET_PROTOSW_PERMANENT |
			    INET_PROTOSW_ICSK,
	},

	/* Datagram sockets: UDP */
	{
		.type     = SOCK_DGRAM,
		.protocol = IPPROTO_UDP,
		.prot     = &udp_prot,
		.ops      = &inet_dgram_ops,
		.flags    = INET_PROTOSW_PERMANENT,
	},

	/* Datagram sockets: ICMP (ping_prot) */
	{
		.type     = SOCK_DGRAM,
		.protocol = IPPROTO_ICMP,
		.prot     = &ping_prot,
		.ops      = &inet_dgram_ops,
		.flags    = INET_PROTOSW_REUSE,
	},

	/* Raw sockets: protocol is the wild card */
	{
		.type     = SOCK_RAW,
		.protocol = IPPROTO_IP,	/* wild card */
		.prot     = &raw_prot,
		.ops      = &inet_sockraw_ops,
		.flags    = INET_PROTOSW_REUSE,
	}
};
结合起来看:
1 对于流式套接字ops字段将被赋值inet_stream_ops指针
2 对于数据报套接字ops字段将被赋值inet_dgram_ops指针
3 对于原生套接字ops字段将被赋值inet_sockraw_ops指针
操作函数集详细定义为:
/*
 * Socket operation set for SOCK_STREAM sockets (referenced by the TCP
 * entry of inetsw_array).  bind is the generic inet_bind; poll,
 * splice_read, read_sock and peek_len use the tcp_* handlers.
 */
const struct proto_ops inet_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = inet_bind,
	.connect	   = inet_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = inet_accept,
	.getname	   = inet_getname,
	.poll		   = tcp_poll,
	.ioctl		   = inet_ioctl,
	.listen		   = inet_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
	.splice_read	   = tcp_splice_read,
	.read_sock	   = tcp_read_sock,
	.peek_len	   = tcp_peek_len,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
	.compat_ioctl	   = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);
/*
 * Socket operation set for SOCK_DGRAM sockets (referenced by the UDP
 * and ICMP entries of inetsw_array).  bind is the generic inet_bind;
 * accept and listen are the sock_no_* stubs.
 */
const struct proto_ops inet_dgram_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = inet_bind,
	.connect	   = inet_dgram_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = sock_no_accept,
	.getname	   = inet_getname,
	.poll		   = udp_poll,
	.ioctl		   = inet_ioctl,
	.listen		   = sock_no_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
	.set_peek_off	   = sk_set_peek_off,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
	.compat_ioctl	   = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);
/*
 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
 * udp_poll (datagram_poll is used instead).  bind is again the generic
 * inet_bind.
 */
static const struct proto_ops inet_sockraw_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = inet_bind,
	.connect	   = inet_dgram_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = sock_no_accept,
	.getname	   = inet_getname,
	.poll		   = datagram_poll,
	.ioctl		   = inet_ioctl,
	.listen		   = sock_no_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
	.compat_ioctl	   = inet_compat_ioctl,
#endif
};
可知三种类型的套接字,sock->ops->bind实际调用的都是inet_bind,我们分析下该函数:
/*
 * inet_bind - bind an AF_INET socket to a local address and port.
 * @sock:     socket being bound
 * @uaddr:    address to bind, read as a struct sockaddr_in
 * @addr_len: length of @uaddr in bytes
 *
 * If the protocol descriptor supplies its own bind, that result is
 * returned directly.  Otherwise returns 0 on success, or -EINVAL,
 * -EAFNOSUPPORT, -EADDRNOTAVAIL, -EACCES or -EADDRINUSE.
 */
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
u32 tb_id = RT_TABLE_LOCAL;
int err;
/* If the socket has its own bind function then use it. (RAW) */
/* For TCP sk_prot is tcp_prot, which sets no .bind, so TCP does not enter this branch. */
if (sk->sk_prot->bind) {
err = sk->sk_prot->bind(sk, uaddr, addr_len);
goto out;
}
err = -EINVAL;
if (addr_len < sizeof(struct sockaddr_in))
goto out;
if (addr->sin_family != AF_INET) {
/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
 * only if s_addr is INADDR_ANY.
 */
err = -EAFNOSUPPORT;
if (addr->sin_family != AF_UNSPEC ||
addr->sin_addr.s_addr != htonl(INADDR_ANY))
goto out;
} tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
/* Not specified by any standard per-se, however it breaks too
 * many applications when removed.  It is unfortunate since
 * allowing applications to make a non-local bind solves
 * several problems with systems using dynamic addressing.
 * (ie. your servers still start up even if your ISDN link
 *  is temporarily down)
 */
err = -EADDRNOTAVAIL;
if (!net->ipv4.sysctl_ip_nonlocal_bind &&
!(inet->freebind || inet->transparent) &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
chk_addr_ret != RTN_BROADCAST)
goto out;
snum = ntohs(addr->sin_port);
/* A non-zero port below PROT_SOCK requires CAP_NET_BIND_SERVICE in the netns's user namespace. */
err = -EACCES;
if (snum && snum < PROT_SOCK &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
/*      We keep a pair of addresses. rcv_saddr is the one
 *      used by hash lookups, and saddr is used for transmit.
 *
 *      In the BSD API these are the same except where it
 *      would be illegal to use them (multicast/broadcast) in
 *      which case the sending device address is used.
 */
lock_sock(sk);
/* Check these errors (active socket, double bind). */
/* Fail if the socket is not in its initial TCP_CLOSE state or already has a port bound. */
/* Author's note: a socket binds at most one port, while one port may be shared by several sockets. */
err = -EINVAL;
if (sk->sk_state != TCP_CLOSE || inet->inet_num)
goto out_release_sock;
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->inet_saddr = 0;  /* Use device */
/* Make sure we are allowed to bind here. */
/* For TCP (tcp_prot) get_port is inet_csk_get_port; when it picks a port itself it tries odd ports. */
/*
 * get_port returns 0 when the port is usable.  Author's note: it adds
 * the socket to the bind hash table; once the hash bucket is found,
 * inet_bind_hash sets the sk's port number and adds the sk to the
 * bucket's owners list.  The bucket object is struct inet_bind_bucket
 * and TCP's hash table is tcp_hashinfo.
 */
if ((snum || !inet->bind_address_no_port) &&
sk->sk_prot->get_port(sk, snum)) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
}// inet_rcv_saddr is the bound address; per the author it is used to look up the socket when receiving data
/* Record in sk_userlocks that an address and/or a port was explicitly bound. */
if (inet->inet_rcv_saddr)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
inet->inet_sport = htons(inet->inet_num);
inet->inet_daddr = 0;
inet->inet_dport = 0;
sk_dst_reset(sk);
err = 0;
out_release_sock:
release_sock(sk);
out:
return err;
}
分析该函数,我们发现如果套接字对应的实现协议本身有bind函数,会执行协议的bind函数然后跳到函数结尾返回;如果对应的实现协议本身没有定义bind函数则往下执行。查看tcp和udp的协议描述块,我们没有发现bind函数,所以对于tcp和udp来说if内的代码是不会执行的,而是继续往下执行。
//slab在inet_init中调用proto_register时建立
struct proto tcp_prot = {
.name= "TCP",
.owner= THIS_MODULE,
.close= tcp_close,
.connect= tcp_v4_connect,
.disconnect= tcp_disconnect,
.accept= inet_csk_accept,
.ioctl= tcp_ioctl,
.init= tcp_v4_init_sock,
.destroy= tcp_v4_destroy_sock,
.shutdown= tcp_shutdown,
.setsockopt= tcp_setsockopt,
.getsockopt= tcp_getsockopt,
.recvmsg= tcp_recvmsg,
.sendmsg= tcp_sendmsg,
.sendpage= tcp_sendpage,
.backlog_rcv= tcp_v4_do_rcv,
.release_cb= tcp_release_cb,
.hash= inet_hash,
.unhash= inet_unhash,
.get_port= inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count= &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem= sysctl_tcp_mem,
.sysctl_wmem= sysctl_tcp_wmem,
.sysctl_rmem= sysctl_tcp_rmem,
.max_header= MAX_TCP_HEADER,
.obj_size= sizeof(struct tcp_sock),
.slab_flags= SLAB_DESTROY_BY_RCU,
.twsk_prot= &tcp_timewait_sock_ops,
.rsk_prot= &tcp_request_sock_ops,
.h.hashinfo= &tcp_hashinfo,
.no_autobind= true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
.diag_destroy= tcp_abort,
};
//slab在inet_init中调用proto_register时建立struct proto udp_prot = {
.name= "UDP",
.owner= THIS_MODULE,
.close= udp_lib_close,
.connect= ip4_datagram_connect,
.disconnect= udp_disconnect,
.ioctl= udp_ioctl,
.destroy= udp_destroy_sock,
.setsockopt= udp_setsockopt,
.getsockopt= udp_getsockopt,
.sendmsg= udp_sendmsg,
.recvmsg= udp_recvmsg,
.sendpage= udp_sendpage,
.backlog_rcv= __udp_queue_rcv_skb,
.release_cb= ip4_datagram_release_cb,
.hash= udp_lib_hash,
.unhash= udp_lib_unhash,
.rehash= udp_v4_rehash,
.get_port= udp_v4_get_port,
.memory_allocated= &udp_memory_allocated,
.sysctl_mem= sysctl_udp_mem,
.sysctl_wmem= &sysctl_udp_wmem_min,
.sysctl_rmem= &sysctl_udp_rmem_min,
.obj_size= sizeof(struct udp_sock),
.h.udp_table= &udp_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
.compat_getsockopt = compat_udp_getsockopt,
#endif
.diag_destroy= udp_abort,
};
只有对于原生类型的套接字才有自己的bind函数:
/* If the socket has its own bind function then use it. (RAW) */
/* Excerpt from inet_bind: for TCP sk_prot is tcp_prot, which sets no .bind, */
/* so TCP does not enter this branch. */
if (sk->sk_prot->bind) {
err = sk->sk_prot->bind(sk, uaddr, addr_len);
goto out;
}
往下的代码主要是地址信息合法性检查,还有inet_sock对象的设置(对udp而言,实际上是udp_sock对象中的inet成员——查看udp_sock定义,可知udp_sock中包含一个inet_sock对象)、申请端口等。留意端口申请代码:
/*
 * Excerpt from inet_bind: ask the protocol to reserve the port.
 * For UDP the call chain is udp_v4_get_port --> udp_lib_get_port, which
 * adds the sock to the hash table udp_table; for TCP it is
 * inet_csk_get_port (see the .get_port members of the protocol
 * descriptors referenced from inetsw_array).
 * A non-zero return means the port could not be obtained, so the
 * addresses set just before are cleared and -EADDRINUSE is reported.
 */
if ((snum || !inet->bind_address_no_port) &&
    sk->sk_prot->get_port(sk, snum)) {
	inet->inet_saddr = inet->inet_rcv_saddr = 0;
	err = -EADDRINUSE;
	goto out_release_sock;
}
查看协议描述块可知,对于tcp,调用的是inet_csk_get_port,对于udp调用的是udp_v4_get_port,我们只分析inet_csk_get_port:
/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 * We try to allocate an odd port (and leave even ports for connect())
 *
 * @sk:   socket that wants the port
 * @snum: requested port, or 0 to let this function choose one
 *
 * Returns 0 on success and non-zero (ret stays 1) on failure.
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
/* For TCP sk_prot is tcp_prot, whose h.hashinfo is &tcp_hashinfo. */
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
int ret = 1, attempts = 5, port = snum;
int smallest_size = -1, smallest_port;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
kuid_t uid = sock_i_uid(sk);
u32 remaining, offset;
if (port) {
have_port:
/* For TCP hinfo is tcp_hashinfo (author's note: initialised in tcp_init). */
/*
 * Pick the hash bucket for this port.  inet_bhashfn maps (net, port)
 * to an index bounded by bhash_size (author's note: essentially the
 * port masked with the table length - its body is not shown here).
 * head is the resulting chain head; per the author each chain entry
 * groups the socks that share one port value (possibly via port reuse).
 */
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
/* Walk every entry of the chain looking for this netns/port pair. */
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == port)
goto tb_found;
goto tb_not_found;
}
again:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
/* No port was requested: pick an unused one at random. */
/* Fetch the allowed local port range. */
inet_get_local_port_range(net, &low, &high);
high++;
/* [32768, 60999] -> [32768, 61000[ */
if (high - low < 4)
attempt_half = 0;
if (attempt_half) {
int half = low + (((high - low) >> 2) << 1);
if (attempt_half == 1)
high = half;
else
low = half;
}
remaining = high - low;
if (likely(remaining > 1))
remaining &= ~1U;
offset = prandom_u32() % remaining;
/* __inet_hash_connect() favors ports having @low parity
 * We do the opposite to not pollute connect() users.
 */
offset |= 1U;
smallest_size = -1;
smallest_port = low; /* avoid compiler warning */
other_parity_scan:
port = low + offset;
for (i = 0;
i < remaining;
i += 2, port += 2) {
if (unlikely(port >= high))
port -= remaining;
if (inet_is_local_reserved_port(net, port))
continue;
/* For TCP hinfo is tcp_hashinfo. */
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
/* Walk the hash chain from its head. */
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == port) {
/* A bucket for this port already exists, i.e. the port is in use. */
if (((tb->fastreuse > 0 && reuse) ||
(tb->fastreuseport > 0 &&
sk->sk_reuseport &&
!rcu_access_pointer(sk->sk_reuseport_cb) &&
uid_eq(tb->fastuid, uid))) &&
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners;
/* Remember how many owners this port has. */
smallest_port = port;
/* Remember this port. */
}// Author's note (bind_conflict itself is not shown here) - a port can be reused when:
// 1. the sockets are bound to different network interfaces;
// 2. every socket has address reuse set and none is in the listen state, i.e. they are all outgoing sockets that choose their own destination;
// and even if neither 1 nor 2 holds, server sockets using different source addresses may share a port. For ordinary TCP the conflict callback is inet_csk_bind_conflict. If many ports are already bound, check whether this port has a bind conflict.
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
goto tb_found;
goto next_port;
}
goto tb_not_found;
next_port:
spin_unlock_bh(&head->lock);
cond_resched();
} if (smallest_size != -1) {
port = smallest_port;
goto have_port;
}
offset--;
if (!(offset & 1))
goto other_parity_scan;
if (attempt_half == 1) {
/* OK we now try the upper half of the range */
attempt_half = 2;
goto other_half_scan;
}
return ret;
tb_not_found:
/* For TCP hinfo is tcp_hashinfo.  No entry for this port exists in the chain, so create one and insert it. */
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port);
if (!tb)
goto fail_unlock;
tb_found:
/* The bucket already has owners: the port is in use. */
if (!hlist_empty(&tb->owners)) {
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
if (((tb->fastreuse > 0 && reuse) ||
(tb->fastreuseport > 0 &&
!rcu_access_pointer(sk->sk_reuseport_cb) &&
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size == -1)
goto success;
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
if ((reuse ||
(tb->fastreuseport > 0 &&
sk->sk_reuseport &&
!rcu_access_pointer(sk->sk_reuseport_cb) &&
uid_eq(tb->fastuid, uid))) &&
smallest_size != -1 && --attempts >= 0) {
spin_unlock_bh(&head->lock);
goto again;
}
goto fail_unlock;
}
if (!reuse)
tb->fastreuse = 0;
if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
tb->fastreuseport = 0;
} else {
/* First owner of this bucket: seed the fast-reuse hints from this socket. */
tb->fastreuse = reuse;
if (sk->sk_reuseport) {
tb->fastreuseport = 1;
tb->fastuid = uid;
} else {
tb->fastreuseport = 0;
}
}
success:
/* Attach the sock to the bucket unless it is already attached to one. */
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
ret = 0;
fail_unlock:
spin_unlock_bh(&head->lock);
return ret;
}
该函数除了得到可用的端口外,主要作用是将sock对象加入&hinfo->bhash哈希表。我们知道哈希表可以加快搜索的速度,这在数据接收过程中很有用。在数据接收过程中,本机依据目的IP接收发往自己的数据报,但是系统中有很多进程、很多socket连接,仅凭IP并不知道数据是要给哪个进程的。通过端口号和哈希表可以快速定位接收数据的sock,然后将数据放到sock的接收队列中,等待用户线程取数据,这样就完成了一次网络通信。
最后,总的来说,bind系统调用本质上是根据端口号,将socket调用得到的socket对象加入bind哈希表。
推荐阅读
- 考研英语阅读终极解决方案——阅读理解如何巧拿高分
- Ⅴ爱阅读,亲子互动——打卡第178天
- 期刊|期刊 | 国内核心期刊之(北大核心)
- “成长”读书社群招募
- 上班后阅读开始变成一件奢侈的事
- 人间词话的智慧
- 读司马懿,知人间事,品百味人生
- 以读攻“毒”唤新活动曹彦斌打卡第二天
- 私通和背叛,他怎么看(——晨读小记)
- 【0212读书感悟】