Table of Contents
__sock_create函数
结构
socket_state
struct socket
struct sock
struct proto_ops
函数原型
__sock_create
security_socket_create
call_int_hook
socket_create
selinux_socket_create
socket_sockcreate_sid
security_transition_sid
sock_alloc
security_socket_post_create
【Linux内核协议栈- 创建socket(__sock_create函数调用关系)】socket_post_create
selinux_socket_post_create
__sock_create函数
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern);
其在linux内核中的位置:
文章图片
结构 socket_state
//https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/net.h#L54
typedef enum {
SS_FREE = 0,/* not allocated*/
SS_UNCONNECTED,/* unconnected to any socket */
SS_CONNECTING,/* in process of connecting */
SS_CONNECTED,/* connected to socket*/
SS_DISCONNECTING/* in process of disconnecting */
} socket_state;
struct socket
/**
*struct socket - general BSD socket
*@state: socket state (%SS_CONNECTED, etc)
*@type: socket type (%SOCK_STREAM, etc)
*@flags: socket flags (%SOCK_NOSPACE, etc)
*@ops: protocol specific socket operations
*@file: File back pointer for gc
*@sk: internal networking protocol agnostic socket representation
*@wq: wait queue for several uses
*https://elixir.bootlin.com/linux/latest/source/include/linux/net.h#L112
*/
struct socket {
socket_statestate;
shorttype;
unsigned longflags;
struct file*file;
struct sock*sk;
const struct proto_ops *ops;
struct socket_wq wq;
};
struct sock
/**
* struct sock - network layer representation of sockets
* @__sk_common: shared layout with inet_timewait_sock
* @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
* @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
* @sk_lock: synchronizer
* @sk_kern_sock: True if sock is using kernel lock classes
* @sk_rcvbuf: size of receive buffer in bytes
* @sk_wq: sock wait queue and async head
* @sk_rx_dst: receive input route used by early demux
* @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
* @sk_rx_skb_cache: cache copy of recently accessed RX skb
* @sk_receive_queue: incoming packets
* @sk_wmem_alloc: transmit queue bytes committed
* @sk_tsq_flags: TCP Small Queues flags
* @sk_write_queue: Packet sending queue
* @sk_omem_alloc: "o" is "option" or "other"
* @sk_wmem_queued: persistent queue size
* @sk_forward_alloc: space allocated forward
* @sk_napi_id: id of the last napi context to receive data for sk
* @sk_ll_usec: usecs to busypoll when there is no data
* @sk_allocation: allocation mode
* @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
* @sk_pacing_status: Pacing status (requested, handled by sch_fq)
* @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
* @sk_sndbuf: size of send buffer in bytes
* @__sk_flags_offset: empty field used to determine location of bitfield
* @sk_padding: unused element for alignment
* @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
* @sk_no_check_rx: allow zero checksum in RX packets
* @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
* @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
* @sk_route_forced_caps: static, forced route capabilities
*(set in tcp_init_sock())
* @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
* @sk_gso_max_size: Maximum GSO segment size to build
* @sk_gso_max_segs: Maximum number of GSO segments
* @sk_pacing_shift: scaling factor for TCP Small Queues
* @sk_lingertime: %SO_LINGER l_linger setting
* @sk_backlog: always used with the per-socket spinlock held
* @sk_callback_lock: used with the callbacks in the end of this struct
* @sk_error_queue: rarely used
* @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
*IPV6_ADDRFORM for instance)
* @sk_err: last error
* @sk_err_soft: errors that don't cause failure but are the cause of a
*persistent failure not just 'timed out'
* @sk_drops: raw/udp drops counter
* @sk_ack_backlog: current listen backlog
* @sk_max_ack_backlog: listen backlog set in listen()
* @sk_uid: user id of owner
* @sk_priority: %SO_PRIORITY setting
* @sk_type: socket type (%SOCK_STREAM, etc)
* @sk_protocol: which protocol this socket belongs in this network family
* @sk_peer_pid: &struct pid for this socket's peer
* @sk_peer_cred: %SO_PEERCRED setting
* @sk_rcvlowat: %SO_RCVLOWAT setting
* @sk_rcvtimeo: %SO_RCVTIMEO setting
* @sk_sndtimeo: %SO_SNDTIMEO setting
* @sk_txhash: computed flow hash for use on transmit
* @sk_filter: socket filtering instructions
* @sk_timer: sock cleanup timer
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING socket options
* @sk_tskey: counter to disambiguate concurrent tstamp requests
* @sk_zckey: counter to order MSG_ZEROCOPY notifications
* @sk_socket: Identd and reporting IO signals
* @sk_user_data: RPC layer private data
* @sk_frag: cached page frag
* @sk_peek_off: current peek_offset value
* @sk_send_head: front of stuff to transmit
* @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
* @sk_tx_skb_cache: cache copy of recently accessed TX skb
* @sk_security: used by security modules
* @sk_mark: generic packet mark
* @sk_cgrp_data: cgroup data for this cgroup
* @sk_memcg: this socket's memory cgroup association
* @sk_write_pending: a write to stream socket waits to start
* @sk_state_change: callback to indicate change in the state of the sock
* @sk_data_ready: callback to indicate there is data to be processed
* @sk_write_space: callback to indicate there is bf sending space available
* @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
* @sk_backlog_rcv: callback to process the backlog
* @sk_validate_xmit_skb: ptr to an optional validate function
* @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
* @sk_reuseport_cb: reuseport group container
* @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
* @sk_rcu: used during RCU grace period
* @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
* @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
* https://elixir.bootlin.com/linux/latest/source/include/net/sock.h#L346
*/
struct sock {
/*
* Now struct inet_timewait_sock also uses sock_common, so please just
* don't add nothing before this first member (__sk_common) --acme
*/
struct sock_common __sk_common;
#define sk_node__sk_common.skc_node
#define sk_nulls_node__sk_common.skc_nulls_node
#define sk_refcnt__sk_common.skc_refcnt
#define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping
#ifdef CONFIG_XPS
#define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping
#endif#define sk_dontcopy_begin __sk_common.skc_dontcopy_begin
#define sk_dontcopy_end__sk_common.skc_dontcopy_end
#define sk_hash__sk_common.skc_hash
#define sk_portpair__sk_common.skc_portpair
#define sk_num__sk_common.skc_num
#define sk_dport__sk_common.skc_dport
#define sk_addrpair__sk_common.skc_addrpair
#define sk_daddr__sk_common.skc_daddr
#define sk_rcv_saddr__sk_common.skc_rcv_saddr
#define sk_family__sk_common.skc_family
#define sk_state__sk_common.skc_state
#define sk_reuse__sk_common.skc_reuse
#define sk_reuseport__sk_common.skc_reuseport
#define sk_ipv6only__sk_common.skc_ipv6only
#define sk_net_refcnt__sk_common.skc_net_refcnt
#define sk_bound_dev_if__sk_common.skc_bound_dev_if
#define sk_bind_node__sk_common.skc_bind_node
#define sk_prot__sk_common.skc_prot
#define sk_net__sk_common.skc_net
#define sk_v6_daddr__sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr
#define sk_cookie__sk_common.skc_cookie
#define sk_incoming_cpu__sk_common.skc_incoming_cpu
#define sk_flags__sk_common.skc_flags
#define sk_rxhash__sk_common.skc_rxhash socket_lock_tsk_lock;
atomic_tsk_drops;
intsk_rcvlowat;
struct sk_buff_head sk_error_queue;
struct sk_buff*sk_rx_skb_cache;
struct sk_buff_head sk_receive_queue;
/*
* The backlog queue is special, it is always used with
* the per-socket spinlock held and requires low latency
* access. Therefore we special case it's implementation.
* Note : rmem_alloc is in this structure to fill a hole
* on 64bit arches, not because its logically part of
* backlog.
*/
struct {
atomic_t rmem_alloc;
intlen;
struct sk_buff *head;
struct sk_buff *tail;
} sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc intsk_forward_alloc;
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned intsk_ll_usec;
/* ===== mostly read cache line ===== */
unsigned intsk_napi_id;
#endif
intsk_rcvbuf;
struct sk_filter __rcu *sk_filter;
union {
struct socket_wq __rcu *sk_wq;
/* private: */
struct socket_wq *sk_wq_raw;
/* public: */
};
#ifdef CONFIG_XFRM
struct xfrm_policy __rcu *sk_policy[2];
#endif
struct dst_entry *sk_rx_dst;
struct dst_entry __rcu *sk_dst_cache;
atomic_tsk_omem_alloc;
intsk_sndbuf;
/* ===== cache line for TX ===== */
intsk_wmem_queued;
refcount_tsk_wmem_alloc;
unsigned longsk_tsq_flags;
union {
struct sk_buff *sk_send_head;
struct rb_root tcp_rtx_queue;
};
struct sk_buff*sk_tx_skb_cache;
struct sk_buff_head sk_write_queue;
__s32sk_peek_off;
intsk_write_pending;
__u32sk_dst_pending_confirm;
u32sk_pacing_status;
/* see enum sk_pacing */
longsk_sndtimeo;
struct timer_list sk_timer;
__u32sk_priority;
__u32sk_mark;
unsigned longsk_pacing_rate;
/* bytes per second */
unsigned longsk_max_pacing_rate;
struct page_frag sk_frag;
netdev_features_t sk_route_caps;
netdev_features_t sk_route_nocaps;
netdev_features_t sk_route_forced_caps;
intsk_gso_type;
unsigned intsk_gso_max_size;
gfp_tsk_allocation;
__u32sk_txhash;
/*
* Because of non atomicity rules, all
* changes are protected by socket lock.
*/
u8sk_padding : 1,
sk_kern_sock : 1,
sk_no_check_tx : 1,
sk_no_check_rx : 1,
sk_userlocks : 4;
u8sk_pacing_shift;
u16sk_type;
u16sk_protocol;
u16sk_gso_max_segs;
unsigned longsk_lingertime;
struct proto*sk_prot_creator;
rwlock_tsk_callback_lock;
intsk_err,
sk_err_soft;
u32sk_ack_backlog;
u32sk_max_ack_backlog;
kuid_tsk_uid;
struct pid*sk_peer_pid;
const struct cred *sk_peer_cred;
longsk_rcvtimeo;
ktime_tsk_stamp;
#if BITS_PER_LONG==32
seqlock_tsk_stamp_seq;
#endif
u16sk_tsflags;
u8sk_shutdown;
u32sk_tskey;
atomic_tsk_zckey;
u8sk_clockid;
u8sk_txtime_deadline_mode : 1,
sk_txtime_report_errors : 1,
sk_txtime_unused : 6;
struct socket*sk_socket;
void*sk_user_data;
#ifdef CONFIG_SECURITY
void*sk_security;
#endif
struct sock_cgroup_data sk_cgrp_data;
struct mem_cgroup *sk_memcg;
void(*sk_state_change)(struct sock *sk);
void(*sk_data_ready)(struct sock *sk);
void(*sk_write_space)(struct sock *sk);
void(*sk_error_report)(struct sock *sk);
int(*sk_backlog_rcv)(struct sock *sk,
struct sk_buff *skb);
#ifdef CONFIG_SOCK_VALIDATE_XMIT
struct sk_buff*(*sk_validate_xmit_skb)(struct sock *sk,
struct net_device *dev,
struct sk_buff *skb);
#endif
void(*sk_destruct)(struct sock *sk);
struct sock_reuseport __rcu *sk_reuseport_cb;
#ifdef CONFIG_BPF_SYSCALL
struct bpf_sk_storage __rcu *sk_bpf_storage;
#endif
struct rcu_headsk_rcu;
};
struct proto_ops 参考:https://blog.csdn.net/Rong_Toa/article/details/105327127
struct proto_ops {
intfamily;
struct module *owner;
int(*release)(struct socket *sock);
int(*bind)(struct socket *sock,
struct sockaddr *myaddr,
int sockaddr_len);
int(*connect)(struct socket *sock,
struct sockaddr *vaddr,
int sockaddr_len, int flags);
int(*socketpair)(struct socket *sock1,
struct socket *sock2);
int(*accept)(struct socket *sock,
struct socket *newsock, int flags, bool kern);
int(*getname)(struct socket *sock,
struct sockaddr *addr,
int peer);
__poll_t (*poll)(struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int(*ioctl)(struct socket *sock, unsigned int cmd,
unsigned long arg);
#ifdef CONFIG_COMPAT
int(*compat_ioctl) (struct socket *sock, unsigned int cmd,
unsigned long arg);
#endif
int(*gettstamp) (struct socket *sock, void __user *userstamp,
bool timeval, bool time32);
int(*listen)(struct socket *sock, int len);
int(*shutdown)(struct socket *sock, int flags);
int(*setsockopt)(struct socket *sock, int level,
int optname, char __user *optval, unsigned int optlen);
int(*getsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int __user *optlen);
#ifdef CONFIG_COMPAT
int(*compat_setsockopt)(struct socket *sock, int level,
int optname, char __user *optval, unsigned int optlen);
int(*compat_getsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int __user *optlen);
#endif
void(*show_fdinfo)(struct seq_file *m, struct socket *sock);
int(*sendmsg)(struct socket *sock, struct msghdr *m,
size_t total_len);
/* Notes for implementing recvmsg:
* ===============================
* msg->msg_namelen should get updated by the recvmsg handlers
* iff msg_name != NULL. It is by default 0 to prevent
* returning uninitialized memory to user space.The recvfrom
* handlers can assume that msg.msg_name is either NULL or has
* a minimum size of sizeof(struct sockaddr_storage).
*/
int(*recvmsg)(struct socket *sock, struct msghdr *m,
size_t total_len, int flags);
int(*mmap)(struct file *file, struct socket *sock,
struct vm_area_struct * vma);
ssize_t(*sendpage)(struct socket *sock, struct page *page,
int offset, size_t size, int flags);
ssize_t(*splice_read)(struct socket *sock,loff_t *ppos,
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
int(*set_peek_off)(struct sock *sk, int val);
int(*peek_len)(struct socket *sock);
/* The following functions are called internally by kernel with
* sock lock already held.
*/
int(*read_sock)(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor);
int(*sendpage_locked)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int(*sendmsg_locked)(struct sock *sk, struct msghdr *msg,
size_t size);
int(*set_rcvlowat)(struct sock *sk, int val);
};
函数原型 __sock_create
/**
* __sock_create - creates a socket
* @net: net namespace
* @family: protocol family (AF_INET, ...)
* @type: communication type (SOCK_STREAM, ...)
* @protocol: protocol (0, ...)
* @res: new socket
* @kern: boolean for kernel space sockets
*
* Creates a new socket and assigns it to @res, passing through LSM.
* Returns 0 or an error. On failure @res is set to %NULL. @kern must
* be set to true if the socket resides in kernel space.
* This function internally uses GFP_KERNEL.
*/int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
*Check protocol is in range
*/
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX)
return -EINVAL;
/* Compatibility.This uglymoron is moved from INET layer to here to avoid
deadlock in module load.
*/
if (family == PF_INET && type == SOCK_PACKET) {
pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
family = PF_PACKET;
} err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
sock = sock_alloc();
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE;
/* Not exactly a match, but its the
closest posix thing */
} sock->type = type;
#ifdef CONFIG_MODULES
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
#endif rcu_read_lock();
pf = rcu_dereference(net_families[family]);
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release;
/* Now protected by module ref count */
rcu_read_unlock();
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
/*
* Now that we're done with the ->create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock;
return 0;
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock->ops = NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return err;
out_release:
rcu_read_unlock();
goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);
security_socket_create
int security_socket_create(int family, int type, int protocol, int kern)
{
return call_int_hook(socket_create, 0, family, type, protocol, kern);
}
call_int_hook
#define call_int_hook(FUNC, IRC, ...) ({\
int RC = IRC;
\
do {\
struct security_hook_list *P;
\
\
hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \
RC = P->hook.FUNC(__VA_ARGS__);
\
if (RC != 0)\
break;
\
}\
} while (0);
\
RC;
\
})
socket_create 翻阅linux4.20.11源码hooks.c,找到
LSM_HOOK_INIT(socket_create, selinux_socket_create),
selinux_socket_create
static int selinux_socket_create(int family, int type,
int protocol, int kern)
{
const struct task_security_struct *tsec = current_security();
u32 newsid;
u16 secclass;
int rc;
if (kern)
return 0;
secclass = socket_type_to_security_class(family, type, protocol);
rc = socket_sockcreate_sid(tsec, secclass, &newsid);
if (rc)
return rc;
return avc_has_perm(&selinux_state,
tsec->sid, newsid, secclass, SOCKET__CREATE, NULL);
}
socket_sockcreate_sid
/* socket security operations */static int socket_sockcreate_sid(const struct task_security_struct *tsec,
u16 secclass, u32 *socksid)
{
if (tsec->sockcreate_sid > SECSID_NULL) {
*socksid = tsec->sockcreate_sid;
return 0;
} return security_transition_sid(&selinux_state, tsec->sid, tsec->sid,
secclass, NULL, socksid);
}
security_transition_sid
int security_transition_sid(struct selinux_state *state,
u32 ssid, u32 tsid, u16 tclass,
const struct qstr *qstr, u32 *out_sid)
{
return security_compute_sid(state, ssid, tsid, tclass,
AVTAB_TRANSITION,
qstr ? qstr->name : NULL, out_sid, true);
}
sock_alloc 该函数中创建了inode
/**
* sock_alloc - allocate a socket
*
* Allocate a new inode and socket object. The two are bound together
* and initialised. The socket is then returned. If we are out of inodes
* NULL is returned. This functions uses GFP_KERNEL internally.
*/struct socket *sock_alloc(void)
{
struct inode *inode;
struct socket *sock;
inode = new_inode_pseudo(sock_mnt->mnt_sb);
if (!inode)
return NULL;
sock = SOCKET_I(inode);
inode->i_ino = get_next_ino();
inode->i_mode = S_IFSOCK | S_IRWXUGO;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_op = &sockfs_inode_ops;
return sock;
}
EXPORT_SYMBOL(sock_alloc);
security_socket_post_create socket_post_create
int security_socket_post_create(struct socket *sock, int family,
int type, int protocol, int kern)
{
return call_int_hook(socket_post_create, 0, sock, family, type,
protocol, kern);
}...LSM_HOOK_INIT(socket_post_create, selinux_socket_post_create),
selinux_socket_post_create
static int selinux_socket_post_create(struct socket *sock, int family,
int type, int protocol, int kern)
{
const struct task_security_struct *tsec = current_security();
struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock));
struct sk_security_struct *sksec;
u16 sclass = socket_type_to_security_class(family, type, protocol);
u32 sid = SECINITSID_KERNEL;
int err = 0;
if (!kern) {
err = socket_sockcreate_sid(tsec, sclass, &sid);
if (err)
return err;
} isec->sclass = sclass;
isec->sid = sid;
isec->initialized = LABEL_INITIALIZED;
if (sock->sk) {
sksec = sock->sk->sk_security;
sksec->sclass = sclass;
sksec->sid = sid;
/* Allows detection of the first association on this socket */
if (sksec->sclass == SECCLASS_SCTP_SOCKET)
sksec->sctp_assoc_state = SCTP_ASSOC_UNSET;
err = selinux_netlbl_socket_post_create(sock->sk, family);
} return err;
}
参考链接:
https://elixir.bootlin.com/linux/latest/source/net/socket.c#L1362
https://elixir.bootlin.com/linux/latest/source/security/security.c#L2010
详解请听下回分解。