环境:
- 版本:kernel-5.4.54 amd64 双核 ubuntu18.04
- k8s集群网络组件:flannel,kube-proxy: ipvs
- 代码工具:vs code
- SNAT(源地址转换)是IPTABLES的NAT表的核心功能,广泛应用于路由器,云服务器,K8S集群等内网环境中,是内核网络子系统中不可或缺的功能
- IPTABLES的NAT完全依赖于netfilter的conntrack,对于没有进行conntrack的数据包无法进行NAT
- 在K8S集群中DNAT用于负载均衡,SNAT用来保证节点转发的数据包能回到节点去完成de-DNAT还原,而不是直接发给客户端。
- 客户端访问的是负载均衡IP,后端IP直接回包给客户端的话,客户端无法识别;
- 后端IP回包先转给负载均衡器,将后端IP还原成负载均衡IP之后再发给客户端
- IPTABLES和IPVS都可以实现DNAT负载均衡的功能,但是SNAT只能由IPTABLES实现
- 查看集群中IPTABLES的SNAT规则
root@cluster1-worker1:~# iptables -t nat -nL
Chain PREROUTING (policy ACCEPT)
target            prot opt source               destination
KUBE-SERVICES     all  --  0.0.0.0/0            0.0.0.0/0            /* kubernetes service portals */
DOCKER            all  --  0.0.0.0/0            0.0.0.0/0            ADDRTYPE match dst-type LOCAL

Chain INPUT (policy ACCEPT)
target            prot opt source               destination

Chain OUTPUT (policy ACCEPT)
target            prot opt source               destination
KUBE-SERVICES     all  --  0.0.0.0/0            0.0.0.0/0            /* kubernetes service portals */
DOCKER            all  --  0.0.0.0/0           !127.0.0.0/8          ADDRTYPE match dst-type LOCAL

Chain POSTROUTING (policy ACCEPT)
target            prot opt source               destination
KUBE-POSTROUTING  all  --  0.0.0.0/0            0.0.0.0/0            /* kubernetes postrouting rules */
MASQUERADE        all  --  172.17.0.0/16        0.0.0.0/0
RETURN            all  --  10.244.0.0/16        10.244.0.0/16
MASQUERADE        all  --  10.244.0.0/16       !224.0.0.0/4
RETURN            all  -- !10.244.0.0/16        10.244.2.0/24
MASQUERADE        all  -- !10.244.0.0/16        10.244.0.0/16
...
Chain KUBE-POSTROUTING (1 references)
target            prot opt source               destination
/* Kubernetes endpoints dst ip:port, source ip for solving hairpin purpose */
MASQUERADE        all  --  0.0.0.0/0            0.0.0.0/0            match-set KUBE-LOOP-BACK dst,dst,src
RETURN            all  --  0.0.0.0/0            0.0.0.0/0            mark match ! 0x4000/0x4000
MARK              all  --  0.0.0.0/0            0.0.0.0/0            MARK xor 0x4000
/* kubernetes service traffic requiring SNAT */
MASQUERADE        all  --  0.0.0.0/0            0.0.0.0/0
...
分析MASQUERADE是如何SNAT的对于我们了解集群间网络通信很有帮助
2.概念
2.1 de-SNAT
为什么要做de-SNAT?
假设本机将POD1发出的包进行了SNAT,源IP从POD1-IP变成了HOST-IP;这样服务端回包目的地是HOST-IP,但是需要收包的是POD1,如果不de-SNAT把回包的目的地改为POD1-IP,POD1就无法收到数据包
2.2 netfilter中的与SNAT有关的钩子点
文章图片
K8S集群的SNAT规则是在POST_ROUTING做SNAT,在PRE_ROUTING做de-SNAT
3.代码分析
3.1 MASQUERADE在NAT表中注册的钩子函数
/*
 * xtables target registrations for the "MASQUERADE" target.
 *
 * Every entry is bound to the "nat" table and limited to the
 * NF_INET_POST_ROUTING hook. The IPv6 entry is only present when
 * CONFIG_IPV6 is enabled; the IPv4 entry is always present.
 */
static struct xt_target masquerade_tg_reg[] __read_mostly = {
#if IS_ENABLED(CONFIG_IPV6)
	{
		/* IPv6 flavour: handled by masquerade_tg6(). */
		.name       = "MASQUERADE",
		.family     = NFPROTO_IPV6,
		.table      = "nat",
		.hooks      = 1 << NF_INET_POST_ROUTING,
		.target     = masquerade_tg6,
		.targetsize = sizeof(struct nf_nat_range),
		.checkentry = masquerade_tg6_checkentry,
		.destroy    = masquerade_tg_destroy,
		.me         = THIS_MODULE,
	},
#endif
	{
		/* IPv4 flavour: handled by masquerade_tg(). */
		.name       = "MASQUERADE",
		.family     = NFPROTO_IPV4,
		.table      = "nat",
		.hooks      = 1 << NF_INET_POST_ROUTING,
		.target     = masquerade_tg,
		.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
		.checkentry = masquerade_tg_check,
		.destroy    = masquerade_tg_destroy,
		.me         = THIS_MODULE,
	},
};
3.2 masquerade_tg分析
文章图片
static unsigned int
masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
struct nf_nat_range2 range;
const struct nf_nat_ipv4_multi_range_compat *mr;
/* 获取规则的配置和SNAT的可用端口范围 */
mr = par->targinfo;
range.flags = mr->range[0].flags;
range.min_proto = mr->range[0].min;
range.max_proto = mr->range[0].max;
/* 核心函数 */
return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
xt_out(par));
}
3.2.1 nf_nat_masquerade_ipv4分析
unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
const struct nf_nat_range2 *range,
const struct net_device *out)
{
struct nf_conn *ct;
struct nf_conn_nat *nat;
enum ip_conntrack_info ctinfo;
struct nf_nat_range2 newrange;
const struct rtable *rt;
__be32 newsrc, nh;
WARN_ON(hooknum != NF_INET_POST_ROUTING);
/* 获取conntrack连接信息 */
ct = nf_ct_get(skb, &ctinfo);
WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
ctinfo == IP_CT_RELATED_REPLY)));
/* Source address is 0.0.0.0 - locally generated packet that is
* probably not supposed to be masqueraded.
*/
if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
return NF_ACCEPT;
/* 获取路由表 */
rt = skb_rtable(skb);
/* 下一跳的地址 */
nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
/* 选择最合适的SNAT源地址 */
newsrc = https://www.it610.com/article/inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
if (!newsrc) {
pr_info("%s ate my IP address\n", out->name);
return NF_DROP;
}nat = nf_ct_nat_ext_add(ct);
if (nat)
nat->masq_index = out->ifindex;
/* Transfer from original range. */
/* 设置可用的源地址和源端口范围 */
memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
newrange.flags= range->flags | NF_NAT_RANGE_MAP_IPS;
newrange.min_addr.ip = newsrc;
newrange.max_addr.ip = newsrc;
newrange.min_proto= range->min_proto;
newrange.max_proto= range->max_proto;
/* Hand modified range to generic setup. */
/* 根据可用范围确定SNAT源地址,并修改连接记录 */
return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}
3.2.2 nf_nat_setup_info分析
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype)
{
struct net *net = nf_ct_net(ct);
struct nf_conntrack_tuple curr_tuple, new_tuple;
/* Can't setup nat info for confirmed ct. */
if (nf_ct_is_confirmed(ct))
return NF_ACCEPT;
WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
maniptype != NF_NAT_MANIP_DST);
if (WARN_ON(nf_nat_initialized(ct, maniptype)))
return NF_DROP;
/* What we've got will look like inverse of reply. Normally
* this is what is in the conntrack, except for prior
* manipulations (future optimization: if num_manips == 0,
* orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
*/
nf_ct_invert_tuple(&curr_tuple,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
/* 从可用范围中获取唯一的五元组 */
get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
struct nf_conntrack_tuple reply;
/* Alter conntrack table so will recognize replies. */
/* 修改conntrack中的回包的五元组 */
nf_ct_invert_tuple(&reply, &new_tuple);
nf_conntrack_alter_reply(ct, &reply);
/* Non-atomic: we own this at the moment. */
/* 标识需要做的nat类型 */
if (maniptype == NF_NAT_MANIP_SRC)
ct->status |= IPS_SRC_NAT;
else
ct->status |= IPS_DST_NAT;
if (nfct_help(ct) && !nfct_seqadj(ct))
if (!nfct_seqadj_ext_add(ct))
return NF_DROP;
}/* 将连接记录添加到bysource表中 */
if (maniptype == NF_NAT_MANIP_SRC) {
unsigned int srchash;
spinlock_t *lock;
srchash = hash_by_src(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
spin_lock_bh(lock);
hlist_add_head_rcu(&ct->nat_bysource,
&nf_nat_bysource[srchash]);
spin_unlock_bh(lock);
}/* It's done. */
if (maniptype == NF_NAT_MANIP_DST)
ct->status |= IPS_DST_NAT_DONE;
else
ct->status |= IPS_SRC_NAT_DONE;
return NF_ACCEPT;
}
3.2.3 get_unique_tuple分析
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet.
 *
 * @tuple:      output, the tuple chosen for the connection
 * @orig_tuple: tuple the connection currently has
 * @range:      allowed range and flags controlling the selection
 * @ct:         connection the tuple is selected for
 * @maniptype:  whether the source or the destination part is mapped
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig_tuple,
const struct nf_nat_range2 *range,
struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
{
const struct nf_conntrack_zone *zone;
struct net *net = nf_ct_net(ct);
zone = nf_ct_zone(ct);
/* 1) If this srcip/proto/src-proto-part is currently mapped,
 * and that same mapping gives a unique tuple within the given
 * range, use that.
 *
 * This is only required for source (ie. NAT/masq) mappings.
 * So far, we don't do local source mappings, so multiple
 * manips not an issue.
 */
/* First see whether the unmodified tuple already fits the range;
 * otherwise try to reuse a source mapping found by
 * find_appropriate_src().
 */
if (maniptype == NF_NAT_MANIP_SRC &&
!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
/* Reached for source NAT when no NF_NAT_RANGE_PROTO_RANDOM_ALL bit is set. */
/* try the original tuple first */
/* Does the tuple fit the range without any change? */
if (in_range(orig_tuple, range)) {
/* It does: use it unchanged if it is not already in use. */
if (!nf_nat_used_tuple(orig_tuple, ct)) {
*tuple = *orig_tuple;
return;
}
/* Not in range: look for an existing mapping for this source. */
} else if (find_appropriate_src(net, zone,
orig_tuple, tuple, range)) {
pr_debug("get_unique_tuple: Found current src map\n");
/* Accept the found mapping only if the tuple is not in use. */
if (!nf_nat_used_tuple(tuple, ct))
return;
}
}
/* Reached when step 1 was skipped or did not return a tuple. */
/* 2) Select the least-used IP/proto combination in the given range */
*tuple = *orig_tuple;
/* Pick the address part of the tuple from the range. */
find_best_ips_proto(zone, tuple, range, ct, maniptype);
/* 3) The per-protocol part of the manip is made to map into
 * the range to make a unique tuple.
 */
/* Only bother mapping if it's not already in range and unique */
/* Check whether the tuple is acceptable before touching the protocol part. */
if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
l4proto_in_range(tuple, maniptype,
&range->min_proto,
&range->max_proto) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
/* Protocol range specified, no offset flag, protocol part within
 * the range, and either min == max or the tuple is unused:
 * keep the tuple as it is.
 */
return;
} else if (!nf_nat_used_tuple(tuple, ct)) {
/* No protocol range specified and the tuple is unused:
 * keep the tuple as it is.
 */
return;
}
}
/* Last chance: get protocol to try to obtain unique tuple. */
/* Let the l4 protocol code choose the protocol part within the range. */
nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}
先不做对数据包的修改,这里只修改conntrack连接记录,后续根据连接记录对数据包修改
对数据包的修改和de-SNAT在NAT分析文档中:IPTABLES的连接跟踪与NAT分析
3.3 SNAT与MASQ区别
3.3.1 SNAT钩子函数
static struct xt_target xt_nat_target_reg[] __read_mostly = {
{
.name= "SNAT",
.revision= 0,
.checkentry = xt_nat_checkentry_v0,
.destroy= xt_nat_destroy,
.target= xt_snat_target_v0,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.family= NFPROTO_IPV4,
.table= "nat",
.hooks= (1 << NF_INET_POST_ROUTING) |
(1 << NF_INET_LOCAL_IN),
.me= THIS_MODULE,
},
...
3.3.2 xt_snat_target_v0分析
static unsigned int
xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
struct nf_nat_range2 range;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
ct = nf_ct_get(skb, &ctinfo);
WARN_ON(!(ct != NULL &&
(ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
ctinfo == IP_CT_RELATED_REPLY)));
/* 获取范围 */
xt_nat_convert_range(&range, &mr->range[0]);
/* 根据可用范围确定SNAT源地址,并修改连接记录 */
return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
}
【SNAT的MASQUERADE地址选择与端口选择】可以看到SNAT和MASQ最后都调用了nf_nat_setup_info,区别是MASQ在前面有一个选择最合适源IP的步骤。
推荐阅读
- c/c++|有感 Visual Studio 2015 RTM 简介 - 八年后回归 Dot Net,终于迎来了 Mvc 时代,盼走了 Web 窗体时代...
- C/C++|C/C++ basis 02
- Qt实战|Qt+OpenCV联合开发(二十一)--图像翻转与旋转
- Qt实战|Qt+OpenCV联合开发(十四)--图像感兴趣区域(ROI)的提取
- Qt实战|Qt+OpenCV联合开发(十三)--通道分离与合并
- opencv|Qt+OpenCV联合开发(十六)--图像几何形状绘制
- Qt实战|Qt+OpenCV联合开发(十七)--随机数与随机颜色
- IPTABLES的连接跟踪与NAT分析
- IPVS分析