diff options
| author | Eric Dumazet <edumazet@google.com> | 2014-08-17 12:13:07 +0300 |
|---|---|---|
| committer | doc <doc.divxm@gmail.com> | 2014-08-17 12:13:13 +0300 |
| commit | 9a2c6049d43de6f470990fc83a2dd86074c09fd8 (patch) | |
| tree | 935813c0dce03fdafe7b81f6cc9bb9e04dc752ec | |
| parent | c71fdd1a459301651f35e4e12a5522018a358123 (diff) | |
inetpeer: get rid of ip_id_count
[ Upstream commit 73f156a6e8c1074ac6327e0abd1169e95eb66463 ]
Ideally, we would generate IP IDs using a per-destination-IP
generator.
Linux kernels used the inet_peer cache for this purpose, but this had a huge
cost on servers disabling MTU discovery.
1) each inet_peer struct consumes 192 bytes
2) inetpeer cache uses a binary tree of inet_peer structs,
with a nominal size of ~66000 elements under load.
3) lookups in this tree are hitting a lot of cache lines, as tree depth
is about 20.
4) If a server deals with many TCP flows, we have a high probability of
not finding the inet_peer, allocating a fresh one, and inserting it in
the tree with the same initial ip_id_count (cf. secure_ip_id())
5) We garbage collect inet_peer aggressively.
IP ID generation does not have to be 'perfect'.
The goal is to avoid duplicates within a short period of time,
so that reassembly units have a chance to complete reassembly of
fragments belonging to one message before receiving other fragments
with a recycled ID.
We simply use an array of generators, and a Jenkins hash using the dst IP
as a key.
ipv6_select_ident() is put back into net/ipv6/ip6_output.c where it
belongs (it is only used from this file)
secure_ip_id() and secure_ipv6_id() are no longer needed.
Rename ip_select_ident_more() to ip_select_ident_segs() to avoid
unnecessary decrement/increment of the number of segments.
Conflicts:
drivers/net/ppp/pptp.c
include/net/inetpeer.h
include/net/ip.h
include/net/ipip.h
include/net/ipv6.h
net/ipv4/igmp.c
net/ipv4/inetpeer.c
net/ipv4/ip_output.c
net/ipv4/ipmr.c
net/ipv4/raw.c
net/ipv4/route.c
net/ipv4/xfrm4_mode_tunnel.c
net/ipv6/ip6_output.c
net/netfilter/ipvs/ip_vs_xmit.c
Change-Id: I544360c7c781b61c31544b80db2ecaa720f24aea
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
| -rw-r--r-- | drivers/net/ppp/pptp.c | 2 | ||||
| -rw-r--r-- | include/net/inetpeer.h | 22 | ||||
| -rw-r--r-- | include/net/ip.h | 41 | ||||
| -rw-r--r-- | include/net/ipip.h | 2 | ||||
| -rw-r--r-- | include/net/ipv6.h | 9 | ||||
| -rw-r--r-- | include/net/secure_seq.h | 2 | ||||
| -rw-r--r-- | net/core/secure_seq.c | 23 | ||||
| -rw-r--r-- | net/ipv4/igmp.c | 6 | ||||
| -rw-r--r-- | net/ipv4/inetpeer.c | 32 | ||||
| -rw-r--r-- | net/ipv4/ip_output.c | 7 | ||||
| -rw-r--r-- | net/ipv4/ipmr.c | 2 | ||||
| -rw-r--r-- | net/ipv4/raw.c | 2 | ||||
| -rw-r--r-- | net/ipv4/route.c | 46 | ||||
| -rw-r--r-- | net/ipv4/xfrm4_mode_tunnel.c | 2 | ||||
| -rw-r--r-- | net/ipv6/ip6_output.c | 23 | ||||
| -rw-r--r-- | net/netfilter/ipvs/ip_vs_xmit.c | 2 |
16 files changed, 114 insertions, 109 deletions
diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 885dbdd9..dd90c445 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -281,7 +281,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) nf_reset(skb); skb->ip_summed = CHECKSUM_NONE; - ip_select_ident(iph, &rt->dst, NULL); + ip_select_ident(skb, NULL); ip_send_check(iph); ip_local_out(skb); diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index b7e5a936..3f39c72e 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -39,10 +39,14 @@ struct inet_peer { struct list_head gc_list; struct rcu_head gc_rcu; }; + /* + * Once inet_peer is queued for deletion (refcnt == -1), following fields + * are not available: rid, tcp_ts, tcp_ts_stamp + * We can share memory with rcu_head to help keep inet_peer small. + */ union { struct { - atomic_t rid; - atomic_t ip_id_count; + atomic_t rid; /* Frag reception counter */ __u32 tcp_ts; __u32 tcp_ts_stamp; }; @@ -89,17 +93,13 @@ extern bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout); extern void inetpeer_invalidate_tree(int family); +/* + * temporary check to make sure we dont access rid, tcp_ts, + * tcp_ts_stamp if no refcount is taken on inet_peer + */ static inline void inet_peer_refcheck(const struct inet_peer *p) { WARN_ON_ONCE(atomic_read(&p->refcnt) <= 0); } - -static inline int inet_getid(struct inet_peer *p, int more) -{ - more++; - inet_peer_refcheck(p); - return atomic_add_return(more, &p->ip_id_count) - more; -} - -#endif +#endif /* _NET_INETPEER_H */ diff --git a/include/net/ip.h b/include/net/ip.h index ae06390d..b8b2484b 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -268,27 +268,42 @@ int ip_dont_fragment(struct sock *sk, struct dst_entry *dst) !(dst_metric_locked(dst, RTAX_MTU))); } -extern void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more); +#define IP_IDENTS_SZ 2048u +extern atomic_t *ip_idents; -static inline void ip_select_ident(struct iphdr 
*iph, struct dst_entry *dst, struct sock *sk) +static inline u32 ip_idents_reserve(u32 hash, int segs) { - if (iph->frag_off & htons(IP_DF)) { - iph->id = (sk && inet_sk(sk)->inet_daddr) ? - htons(inet_sk(sk)->inet_id++) : 0; - } else - __ip_select_ident(iph, dst, 0); + atomic_t *id_ptr = ip_idents + hash % IP_IDENTS_SZ; + + return atomic_add_return(segs, id_ptr) - segs; } -static inline void ip_select_ident_more(struct iphdr *iph, struct dst_entry *dst, struct sock *sk, int more) +void __ip_select_ident(struct iphdr *iph, int segs); + +static inline void ip_select_ident_segs(struct sk_buff *skb, struct sock *sk, int segs) { - if (iph->frag_off & htons(IP_DF)) { + struct iphdr *iph = ip_hdr(skb); + + if ((iph->frag_off & htons(IP_DF)) && !skb->local_df) { + /* This is only to work around buggy Windows95/2000 + * VJ compression implementations. If the ID field + * does not change, they drop every other packet in + * a TCP stream using header compression. + */ if (sk && inet_sk(sk)->inet_daddr) { iph->id = htons(inet_sk(sk)->inet_id); - inet_sk(sk)->inet_id += 1 + more; - } else + inet_sk(sk)->inet_id += segs; + } else { iph->id = 0; - } else - __ip_select_ident(iph, dst, more); + } + } else { + __ip_select_ident(iph, segs); + } +} + +static inline void ip_select_ident(struct sk_buff *skb, struct sock *sk) +{ + ip_select_ident_segs(skb, sk, 1); } diff --git a/include/net/ipip.h b/include/net/ipip.h index 7b5d3cc9..42a8054f 100644 --- a/include/net/ipip.h +++ b/include/net/ipip.h @@ -48,7 +48,7 @@ struct ip_tunnel_prl_entry { int pkt_len = skb->len - skb_transport_offset(skb); \ \ skb->ip_summed = CHECKSUM_NONE; \ - ip_select_ident(iph, &rt->dst, NULL); \ + ip_select_ident(skb, NULL); \ \ err = ip_local_out(skb); \ if (likely(net_xmit_eval(err) == 0)) { \ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 8d59e1b4..36cc6b64 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -391,14 +391,19 @@ static inline int ipv6_addr_any(const struct in6_addr *a) } 
/* more secured version of ipv6_addr_hash() */ -static inline u32 ipv6_addr_jhash(const struct in6_addr *a) +static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval) { u32 v = (__force u32)a->s6_addr32[0] ^ (__force u32)a->s6_addr32[1]; return jhash_3words(v, (__force u32)a->s6_addr32[2], (__force u32)a->s6_addr32[3], - ipv6_hash_secret); + initval); +} + +static inline u32 ipv6_addr_jhash(const struct in6_addr *a) +{ + return __ipv6_addr_jhash(a, ipv6_hash_secret); } static inline int ipv6_addr_loopback(const struct in6_addr *a) diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index 8efade01..ebcd6467 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -3,8 +3,6 @@ #include <linux/types.h> -extern __u32 secure_ip_id(__be32 daddr); -extern __u32 secure_ipv6_id(const __be32 daddr[4]); extern u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); extern u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport); diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 1008303b..b58dcee0 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -69,29 +69,6 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, #endif #ifdef CONFIG_INET -__u32 secure_ip_id(__be32 daddr) -{ - u32 hash[MD5_DIGEST_WORDS]; - - hash[0] = (__force __u32) daddr; - hash[1] = net_secret[13]; - hash[2] = net_secret[14]; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - return hash[0]; -} - -__u32 secure_ipv6_id(const __be32 daddr[4]) -{ - __u32 hash[4]; - - memcpy(hash, daddr, 16); - md5_transform(hash, net_secret); - - return hash[0]; -} __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d69f5784..fd98cfae 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -325,8 +325,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int 
size) pip->daddr = fl4.daddr; pip->saddr = fl4.saddr; pip->protocol = IPPROTO_IGMP; - pip->tot_len = 0; - ip_select_ident(pip, &rt->dst, NULL); + pip->tot_len = 0; /* filled in later */ + ip_select_ident(skb, NULL); ((u8*)&pip[1])[0] = IPOPT_RA; ((u8*)&pip[1])[1] = 4; ((u8*)&pip[1])[2] = 0; @@ -667,7 +667,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; - ip_select_ident(iph, &rt->dst, NULL); + ip_select_ident(skb, NULL); ((u8*)&iph[1])[0] = IPOPT_RA; ((u8*)&iph[1])[1] = 4; ((u8*)&iph[1])[2] = 0; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 60fd9d3d..6af6a8ea 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -22,6 +22,34 @@ #include <net/inetpeer.h> #include <net/secure_seq.h> +/* + * Theory of operations. + * We keep one entry for each peer IP address. The nodes contains long-living + * information about the peer which doesn't depend on routes. + * + * Nodes are removed only when reference counter goes to 0. + * When it's happened the node may be removed when a sufficient amount of + * time has been passed since its last use. The less-recently-used entry can + * also be removed if the pool is overloaded i.e. if the total amount of + * entries is greater-or-equal than the threshold. + * + * Node pool is organised as an AVL tree. + * Such an implementation has been chosen not just for fun. It's a way to + * prevent easy and efficient DoS attacks by creating hash collisions. A huge + * amount of long living nodes in a single hash slot would significantly delay + * lookups performed with disabled BHs. + * + * Serialisation issues. + * 1. Nodes may appear in the tree only with the pool lock held. + * 2. Nodes may disappear from the tree only with the pool lock held + * AND reference count being 0. + * 3. Global variable peer_total is modified under the pool lock. + * 4. 
struct inet_peer fields modification: + * avl_left, avl_right, avl_parent, avl_height: pool lock + * refcnt: atomically against modifications on other CPU; + * usually under some other lock to prevent node disappearing + * daddr: unchangeable + */ static struct kmem_cache *peer_cachep __read_mostly; @@ -410,10 +438,6 @@ relookup: p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, - (daddr->family == AF_INET) ? - secure_ip_id(daddr->addr.a4) : - secure_ipv6_id(daddr->addr.a6)); p->tcp_ts_stamp = 0; p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0d84bfdc..26b0c6a4 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -112,7 +112,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(iph, &rt->dst, sk); + ip_select_ident(skb, sk); if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -327,8 +327,7 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_more(iph, &rt->dst, sk, - (skb_shinfo(skb)->gso_segs ?: 1) - 1); + ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -1140,7 +1139,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, iph->ihl = 5; iph->tos = inet->tos; iph->frag_off = df; - ip_select_ident(iph, &rt->dst, sk); + ip_select_ident(skb, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 92f646aa..fe9b71a9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1566,7 +1566,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); - ip_select_ident(iph, skb_dst(skb), NULL); + 
ip_select_ident(skb, NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 68b70cde..dda75f7b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -357,7 +357,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, iph->check = 0; iph->tot_len = htons(length); if (!iph->id) - ip_select_ident(iph, &rt->dst, NULL); + ip_select_ident(skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 57e115bd..e646f6fa 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1200,36 +1200,23 @@ void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) rt->rt_peer_genid = rt_peer_genid(); } -static void ip_select_fb_ident(struct iphdr *iph) -{ - static DEFINE_SPINLOCK(ip_fb_id_lock); - static u32 ip_fallback_id; - u32 salt; - - spin_lock_bh(&ip_fb_id_lock); - salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); - iph->id = htons(salt & 0xFFFF); - ip_fallback_id = salt; - spin_unlock_bh(&ip_fb_id_lock); -} +atomic_t *ip_idents __read_mostly; +EXPORT_SYMBOL(ip_idents); -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +void __ip_select_ident(struct iphdr *iph, int segs) { - struct rtable *rt = (struct rtable *) dst; - - if (rt && !(rt->dst.flags & DST_NOPEER)) { - if (rt->peer == NULL) - rt_bind_peer(rt, rt->rt_dst, 1); + static u32 ip_idents_hashrnd __read_mostly; + static bool hashrnd_initialized = false; + u32 hash, id; - if (rt->peer) { - iph->id = htons(inet_getid(rt->peer, more)); - return; - } - } else if (!rt) - printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", - __builtin_return_address(0)); + if (unlikely(!hashrnd_initialized)) { + hashrnd_initialized = true; + get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); + } - ip_select_fb_ident(iph); + hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd); + id = ip_idents_reserve(hash, segs); + iph->id = 
htons(id); } EXPORT_SYMBOL(__ip_select_ident); @@ -2738,7 +2725,6 @@ static int rt_fill_info(struct net *net, error = rt->dst.error; if (peer) { inet_peer_refcheck(rt->peer); - id = atomic_read(&peer->ip_id_count) & 0xffff; if (peer->tcp_ts_stamp) { ts = peer->tcp_ts; tsage = get_seconds() - peer->tcp_ts_stamp; @@ -3172,6 +3158,12 @@ int __init ip_rt_init(void) { int rc = 0; + ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); + if (!ip_idents) + panic("IP: failed to allocate ip_idents\n"); + + get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 235c1f4c..164c0820 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -50,12 +50,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 
0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - ip_select_ident(top_iph, dst->child, NULL); top_iph->ttl = ip4_dst_hoplimit(dst->child); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; + ip_select_ident(skb, NULL); return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fb5a9c2e..c99598d0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -545,22 +545,17 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) { - static atomic_t ipv6_fragmentation_id; - int ident; + static u32 ip6_idents_hashrnd __read_mostly; + static bool hashrnd_initialized = false; + u32 hash, id; - if (rt && !(rt->dst.flags & DST_NOPEER)) { - struct inet_peer *peer; - - if (!rt->rt6i_peer) - rt6_bind_peer(rt, 1); - peer = rt->rt6i_peer; - if (peer) { - fhdr->identification = htonl(inet_getid(peer, 0)); - return; - } + if (unlikely(!hashrnd_initialized)) { + hashrnd_initialized = true; + get_random_bytes(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); } - ident = atomic_inc_return(&ipv6_fragmentation_id); - fhdr->identification = htonl(ident); + hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd); + id = ip_idents_reserve(hash, 1); + fhdr->identification = htonl(id); } int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 7fd66dec..559d3071 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -853,7 +853,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->daddr = cp->daddr.ip; iph->saddr = saddr; iph->ttl = old_iph->ttl; - ip_select_ident(iph, &rt->dst, NULL); + ip_select_ident(skb, NULL); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; |
