net_sched: sch_fq: add dctcp-like marking
author Eric Dumazet <edumazet@google.com>
Sun, 11 Nov 2018 17:11:31 +0000 (09:11 -0800)
committer David S. Miller <davem@davemloft.net>
Sun, 11 Nov 2018 21:59:21 +0000 (13:59 -0800)
Similar to 80ba92fa1a92 ("codel: add ce_threshold attribute")

After EDT adoption, it became easier to implement DCTCP-like CE marking.

In many cases, queues are not building in the network fabric but on
the hosts themselves.

If packets leaving fq missed their Earliest Departure Time by XXX usec,
we mark them with ECN CE. This gives feedback (after one RTT) to
the sender to slow down and find a better operating mode.

Example :

tc qd replace dev eth0 root fq ce_threshold 2.5ms

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/uapi/linux/pkt_sched.h
net/sched/sch_fq.c

index 89ee47c2f17d86fba9a37733b5593680ceefcf00..ee017bc057a3cb390f995329ec8ab5432a844557 100644 (file)
@@ -864,6 +864,8 @@ enum {
 
        TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */
 
+       TCA_FQ_CE_THRESHOLD,    /* DCTCP-like CE-marking threshold */
+
        __TCA_FQ_MAX
 };
 
@@ -882,6 +884,7 @@ struct tc_fq_qd_stats {
        __u32   inactive_flows;
        __u32   throttled_flows;
        __u32   unthrottle_latency_ns;
+       __u64   ce_mark;                /* packets above ce_threshold */
 };
 
 /* Heavy-Hitter Filter */
index 4b1af706896c07e5a0fe6d542dfcd530acdcf8f5..3671eab91107d168062ab73ebb0640d44f94fc95 100644 (file)
@@ -94,6 +94,7 @@ struct fq_sched_data {
        u32             flow_refill_delay;
        u32             flow_plimit;    /* max packets per flow */
        unsigned long   flow_max_rate;  /* optional max rate per flow */
+       u64             ce_threshold;
        u32             orphan_mask;    /* mask for orphaned skb */
        u32             low_rate_threshold;
        struct rb_root  *fq_root;
@@ -107,6 +108,7 @@ struct fq_sched_data {
        u64             stat_gc_flows;
        u64             stat_internal_packets;
        u64             stat_throttled;
+       u64             stat_ce_mark;
        u64             stat_flows_plimit;
        u64             stat_pkts_too_long;
        u64             stat_allocation_errors;
@@ -454,6 +456,11 @@ begin:
                        fq_flow_set_throttled(q, f);
                        goto begin;
                }
+               if (time_next_packet &&
+                   (s64)(now - time_next_packet - q->ce_threshold) > 0) {
+                       INET_ECN_set_ce(skb);
+                       q->stat_ce_mark++;
+               }
        }
 
        skb = fq_dequeue_head(sch, f);
@@ -650,6 +657,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
        [TCA_FQ_BUCKETS_LOG]            = { .type = NLA_U32 },
        [TCA_FQ_FLOW_REFILL_DELAY]      = { .type = NLA_U32 },
        [TCA_FQ_LOW_RATE_THRESHOLD]     = { .type = NLA_U32 },
+       [TCA_FQ_CE_THRESHOLD]           = { .type = NLA_U32 },
 };
 
 static int fq_change(struct Qdisc *sch, struct nlattr *opt,
@@ -729,6 +737,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
        if (tb[TCA_FQ_ORPHAN_MASK])
                q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
 
+       if (tb[TCA_FQ_CE_THRESHOLD])
+               q->ce_threshold = (u64)NSEC_PER_USEC *
+                                 nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
+
        if (!err) {
                sch_tree_unlock(sch);
                err = fq_resize(sch, fq_log);
@@ -779,6 +791,10 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
        q->fq_trees_log         = ilog2(1024);
        q->orphan_mask          = 1024 - 1;
        q->low_rate_threshold   = 550000 / 8;
+
+       /* Default ce_threshold of 4294 seconds */
+       q->ce_threshold         = (u64)NSEC_PER_USEC * ~0U;
+
        qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
 
        if (opt)
@@ -792,6 +808,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
        struct fq_sched_data *q = qdisc_priv(sch);
+       u64 ce_threshold = q->ce_threshold;
        struct nlattr *opts;
 
        opts = nla_nest_start(skb, TCA_OPTIONS);
@@ -800,6 +817,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 
        /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
 
+       do_div(ce_threshold, NSEC_PER_USEC);
+
        if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
            nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
            nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
@@ -812,6 +831,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
            nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
            nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
                        q->low_rate_threshold) ||
+           nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
            nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
                goto nla_put_failure;
 
@@ -841,6 +861,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
        st.throttled_flows        = q->throttled_flows;
        st.unthrottle_latency_ns  = min_t(unsigned long,
                                          q->unthrottle_latency_ns, ~0U);
+       st.ce_mark                = q->stat_ce_mark;
        sch_tree_unlock(sch);
 
        return gnet_stats_copy_app(d, &st, sizeof(st));