/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);

/* Protects the list of registered qdisc modules. */
static DEFINE_RWLOCK(qdisc_mod_lock);

/* The list of all installed queueing disciplines. */
static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->get && cops->put && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}
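
/*
 * Usage sketch (editor's note, not part of this file): a qdisc module
 * typically registers its Qdisc_ops from module init and unregisters on
 * exit. "example_qdisc_ops" and its callbacks are hypothetical names;
 * register_qdisc() rejects a duplicate id with -EEXIST and validates
 * class-ops completeness as above.
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.peek		= qdisc_peek_dequeued,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *	module_init(example_module_init);
 *	module_exit(example_module_exit);
 */
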
/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}
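
/*
 * Worked example (editor's note): with overhead = 0, cell_align = -1 and
 * cell_log = 6, a 64-byte skb gives pkt_len = 64 and
 * slot = (64 - 1) >> 6 = 0, so stab->data[0] supplies the length
 * estimate, which is then scaled by size_log below. Packets longer than
 * the table (tsize entries) fall into the extrapolation branch.
 */
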
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);

void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	qdisc_unthrottled(wd->qdisc);
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	qdisc_throttled(wd->qdisc);
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
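
/*
 * Usage sketch (editor's note, hypothetical "example_sched_data"):
 * rate-limiting qdiscs embed a qdisc_watchdog and arm it from ->dequeue()
 * when the head packet may not be sent yet:
 *
 *	struct example_sched_data {
 *		struct qdisc_watchdog watchdog;
 *		// ...
 *	};
 *
 *	in ->init():    qdisc_watchdog_init(&q->watchdog, sch);
 *	in ->dequeue(): qdisc_watchdog_schedule(&q->watchdog, next_send_time);
 *	in ->reset():   qdisc_watchdog_cancel(&q->watchdog);
 *
 * The hrtimer callback above reschedules the root qdisc so the dequeue
 * is retried once the wait is over (the pattern used by sch_tbf).
 */
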
static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
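
/*
 * Usage sketch (editor's note): classful qdiscs embed a
 * struct Qdisc_class_common at the start of their class type and use the
 * helpers above (hypothetical "example_class"):
 *
 *	struct example_class {
 *		struct Qdisc_class_common common;	// classid + hash node
 *		// ...
 *	};
 *
 *	cl->common.classid = classid;
 *	qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	qdisc_class_hash_grow(sch, &q->clhash);		// amortised rehash
 *
 * with lookups via qdisc_class_find(&q->clhash, classid), as done by
 * e.g. sch_htb.
 */
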
/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
 * Allocate and initialize a new qdisc.
 *
 * Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* The RTNL semaphore is dropped to load the
			 * module, so even on success the caller must
			 * replay the request; -EAGAIN signals that.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			rcu_assign_pointer(sch->stab, stab);
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		 * because the change cannot be undone.
		 */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
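
/*
 * Sketch (editor's note): qdisc_create()/qdisc_change() hand
 * tca[TCA_OPTIONS] to the qdisc's ->init()/->change() callbacks, which
 * parse a module-private option blob. Hypothetical example
 * ("tc_example_qopt" is an assumed struct):
 *
 *	static int example_change(struct Qdisc *sch, struct nlattr *opt)
 *	{
 *		struct tc_example_qopt *qopt;
 *
 *		if (!opt || nla_len(opt) < sizeof(*qopt))
 *			return -EINVAL;
 *		qopt = nla_data(opt);
 *		sch_tree_lock(sch);
 *		// ... apply qopt under the qdisc tree lock ...
 *		sch_tree_unlock(sch);
 *		return 0;
 *	}
 *
 *	static int example_init(struct Qdisc *sch, struct nlattr *opt)
 *	{
 *		return example_change(sch, opt);
 *	}
 */
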
struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
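
/*
 * Example (editor's note): "tc qdisc del dev eth0 root" arrives here as
 * an RTM_DELQDISC message with tcm_ifindex set to eth0's ifindex and
 * tcm_parent = TC_H_ROOT; the lookup above resolves dev->qdisc, and
 * qdisc_graft() with new == NULL then swaps in the builtin noop qdisc
 * and destroys the old root.
 */
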
/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be a default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* NLM_F_CREATE together with NLM_F_REPLACE
				 * plus either NLM_F_EXCL or a different kind
				 * means: replace whatever is attached with a
				 * freshly created qdisc.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
					 TCA_XSTATS, qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}
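
/*
 * Editor's note on the resulting message layout: each dumped qdisc is a
 * struct tcmsg followed by nested attributes, roughly
 *
 *	tcmsg { family, ifindex, handle, parent, info = refcnt }
 *	TCA_KIND	"pfifo_fast" / "htb" / ...
 *	TCA_OPTIONS	qdisc-specific, from q->ops->dump()
 *	TCA_STAB	optional size table
 *	TCA_STATS2	nested basic/rate-est/queue stats (plus app-level
 *			xstats from q->ops->dump_stats())
 *
 * which is what "tc -s qdisc show" decodes.
 */
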
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;

done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}

/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/

static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	 * parent == TC_H_UNSPEC - unspecified parent.
	 * parent == TC_H_ROOT   - class is root, which has no parent.
	 * parent == X:0	 - parent is root class.
	 * parent == X:Y	 - parent is a node in hierarchy.
	 * parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
	 *
	 * handle == 0:0	 - generate handle from kernel pool.
	 * handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	 * handle == X:Y	 - clear.
	 * handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* TC_H_MAJ(pid) may still be unspecified, complete it now. */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}
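
/*
 * Worked example (editor's note): for "tc class add dev eth0 parent 1:
 * classid 1:10 ...", tc encodes handles in hex, so tcm_handle arrives as
 * 0x00010010 ("1:10") and tcm_parent as 0x00010000 ("1:"). TC_H_MAJ()
 * extracts 0x00010000 as qid, and TC_H_MAKE(qid, clid) recombines
 * major/minor when only the minor was given (e.g. "classid :10").
 */
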
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
					 TCA_XSTATS, qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
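
/*
 * Sketch (editor's note): the walk() callback that tc_dump_tclass_qdisc()
 * relies on is implemented by each classful qdisc roughly like this
 * (hypothetical "example_sched_data"/"example_class"):
 *
 *	static void example_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 *	{
 *		struct example_sched_data *q = qdisc_priv(sch);
 *		struct example_class *cl;
 *
 *		// for each class cl of q:
 *		//	if (arg->count < arg->skip) {
 *		//		arg->count++;
 *		//		continue;
 *		//	}
 *		//	if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
 *		//		arg->stop = 1;
 *		//		return;
 *		//	}
 *		//	arg->count++;
 *	}
 *
 * arg->skip/arg->count make netlink dumps resumable; setting arg->stop
 * aborts the walk (seen above as the -1 return, and as -ELOOP in
 * check_loop()).
 */
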
/* Main classifier routine: scans the classifier chain attached to this
 * qdisc, (optionally) tests for protocol and asks specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err;

	for (; tp; tp = tp->next) {
		if (tp->protocol != protocol &&
		    tp->protocol != htons(ETH_P_ALL))
			continue;
		err = tp->classify(skb, tp, res);

		if (err >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *otp = tp;
reclassify:
#endif

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			if (net_ratelimit())
				pr_notice("%s: packet reclassify loop rule prio %u protocol %02x\n",
					  tp->q->ops->id,
					  tp->prio & 0xffff,
					  ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_net_fops_create(net, "psched", 0, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	proc_net_remove(net, "psched");
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);

	return 0;
}

subsys_initcall(pktsched_init);
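
/*
 * Editor's note: the handlers registered above are the kernel entry
 * points behind the tc(8) qdisc/class commands (illustrative mapping):
 *
 *	RTM_NEWQDISC  -> tc_modify_qdisc	"tc qdisc add/change/replace"
 *	RTM_DELQDISC  -> tc_get_qdisc		"tc qdisc del"
 *	RTM_GETQDISC  -> tc_get_qdisc,
 *			 tc_dump_qdisc		"tc qdisc show"
 *	RTM_NEWTCLASS -> tc_ctl_tclass		"tc class add/change/replace"
 *	RTM_DELTCLASS -> tc_ctl_tclass		"tc class del"
 *	RTM_GETTCLASS -> tc_ctl_tclass,
 *			 tc_dump_tclass		"tc class show"
 */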