block: fix the DISCARD request merge
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 653100fb719eb80e1bb11e9ef7761f14f960623f..6075100f03a50a73da838b19891b923d0ad422a7 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -624,12 +624,13 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 }
 
 /*
- * Tell whether there are active queues or groups with differentiated weights.
+ * Tell whether there are active queues with different weights or
+ * active groups.
  */
-static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
 {
        /*
-        * For weights to differ, at least one of the trees must contain
+        * For queue weights to differ, queue_weights_tree must contain
         * at least two nodes.
         */
        return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
@@ -637,9 +638,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
                 bfqd->queue_weights_tree.rb_node->rb_right)
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
               ) ||
-              (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
-               (bfqd->group_weights_tree.rb_node->rb_left ||
-                bfqd->group_weights_tree.rb_node->rb_right)
+               (bfqd->num_active_groups > 0
 #endif
               );
 }
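
The state consulted by the predicate above is small enough to model in isolation. A minimal sketch, with hypothetical toy_* names and simplified counters (the real code walks queue_weights_tree instead of keeping an explicit count of distinct weights):

#include <stdbool.h>

/* Toy model of the state behind the check above (illustrative only). */
struct toy_weights_state {
	unsigned int distinct_active_queue_weights;	/* nodes in queue_weights_tree */
	unsigned int num_active_groups;			/* replaces group_weights_tree */
};

static bool toy_varied_weights_or_active_groups(const struct toy_weights_state *s)
{
	return s->distinct_active_queue_weights > 1 ||
		s->num_active_groups > 0;
}
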
@@ -657,26 +656,25 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
  * 3) all active groups at the same level in the groups tree have the same
  *    number of children.
  *
- * Unfortunately, keeping the necessary state for evaluating exactly the
- * above symmetry conditions would be quite complex and time-consuming.
- * Therefore this function evaluates, instead, the following stronger
- * sub-conditions, for which it is much easier to maintain the needed
- * state:
+ * Unfortunately, keeping the necessary state for evaluating exactly
+ * the last two symmetry sub-conditions above would be quite complex
+ * and time-consuming.  Therefore this function evaluates, instead,
+ * only the following stronger two sub-conditions, for which it is
+ * much easier to maintain the needed state:
  * 1) all active queues have the same weight,
- * 2) all active groups have the same weight,
- * 3) all active groups have at most one active child each.
- * In particular, the last two conditions are always true if hierarchical
- * support and the cgroups interface are not enabled, thus no state needs
- * to be maintained in this case.
+ * 2) there are no active groups.
+ * In particular, the last condition is always true if hierarchical
+ * support or the cgroups interface is not enabled, thus no state
+ * needs to be maintained in this case.
  */
 static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
 {
-       return !bfq_differentiated_weights(bfqd);
+       return !bfq_varied_queue_weights_or_active_groups(bfqd);
 }
 
 /*
  * If the weight-counter tree passed as input contains no counter for
- * the weight of the input entity, then add that counter; otherwise just
+ * the weight of the input queue, then add that counter; otherwise just
  * increment the existing counter.
  *
  * Note that weight-counter trees contain few nodes in mostly symmetric
@@ -687,25 +685,25 @@ static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
  * In most scenarios, the rate at which nodes are created/destroyed
  * should be low too.
  */
-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                          struct rb_root *root)
 {
+       struct bfq_entity *entity = &bfqq->entity;
        struct rb_node **new = &(root->rb_node), *parent = NULL;
 
        /*
-        * Do not insert if the entity is already associated with a
+        * Do not insert if the queue is already associated with a
         * counter, which happens if:
-        *   1) the entity is associated with a queue,
-        *   2) a request arrival has caused the queue to become both
+        *   1) a request arrival has caused the queue to become both
         *      non-weight-raised, and hence change its weight, and
         *      backlogged; in this respect, each of the two events
         *      causes an invocation of this function,
-        *   3) this is the invocation of this function caused by the
+        *   2) this is the invocation of this function caused by the
         *      second event. This second invocation is actually useless,
         *      and we handle this fact by exiting immediately. More
         *      efficient or clearer solutions might possibly be adopted.
         */
-       if (entity->weight_counter)
+       if (bfqq->weight_counter)
                return;
 
        while (*new) {
@@ -715,7 +713,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
                parent = *new;
 
                if (entity->weight == __counter->weight) {
-                       entity->weight_counter = __counter;
+                       bfqq->weight_counter = __counter;
                        goto inc_counter;
                }
                if (entity->weight < __counter->weight)
@@ -724,66 +722,67 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
                        new = &((*new)->rb_right);
        }
 
-       entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
-                                        GFP_ATOMIC);
+       bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+                                      GFP_ATOMIC);
 
        /*
         * In the unlucky event of an allocation failure, we just
-        * exit. This will cause the weight of entity to not be
-        * considered in bfq_differentiated_weights, which, in its
-        * turn, causes the scenario to be deemed wrongly symmetric in
-        * case entity's weight would have been the only weight making
-        * the scenario asymmetric. On the bright side, no unbalance
-        * will however occur when entity becomes inactive again (the
-        * invocation of this function is triggered by an activation
-        * of entity). In fact, bfq_weights_tree_remove does nothing
-        * if !entity->weight_counter.
+        * exit. This will cause the weight of bfqq to not be
+        * considered in bfq_varied_queue_weights_or_active_groups,
+        * which, in its turn, causes the scenario to be deemed
+        * wrongly symmetric in case bfqq's weight would have been
+        * the only weight making the scenario asymmetric.  On the
+        * bright side, no imbalance will occur, however, when bfqq
+        * becomes inactive again (the invocation of this function
+        * is triggered by an activation of bfqq).  In fact,
+        * bfq_weights_tree_remove does nothing if
+        * !bfqq->weight_counter.
         */
-       if (unlikely(!entity->weight_counter))
+       if (unlikely(!bfqq->weight_counter))
                return;
 
-       entity->weight_counter->weight = entity->weight;
-       rb_link_node(&entity->weight_counter->weights_node, parent, new);
-       rb_insert_color(&entity->weight_counter->weights_node, root);
+       bfqq->weight_counter->weight = entity->weight;
+       rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
+       rb_insert_color(&bfqq->weight_counter->weights_node, root);
 
 inc_counter:
-       entity->weight_counter->num_active++;
+       bfqq->weight_counter->num_active++;
 }
 
 /*
- * Decrement the weight counter associated with the entity, and, if the
+ * Decrement the weight counter associated with the queue, and, if the
  * counter reaches 0, remove the counter from the tree.
  * See the comments to the function bfq_weights_tree_add() for considerations
  * about overhead.
  */
 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
-                              struct bfq_entity *entity,
+                              struct bfq_queue *bfqq,
                               struct rb_root *root)
 {
-       if (!entity->weight_counter)
+       if (!bfqq->weight_counter)
                return;
 
-       entity->weight_counter->num_active--;
-       if (entity->weight_counter->num_active > 0)
+       bfqq->weight_counter->num_active--;
+       if (bfqq->weight_counter->num_active > 0)
                goto reset_entity_pointer;
 
-       rb_erase(&entity->weight_counter->weights_node, root);
-       kfree(entity->weight_counter);
+       rb_erase(&bfqq->weight_counter->weights_node, root);
+       kfree(bfqq->weight_counter);
 
 reset_entity_pointer:
-       entity->weight_counter = NULL;
+       bfqq->weight_counter = NULL;
 }
 
 /*
- * Invoke __bfq_weights_tree_remove on bfqq and all its inactive
- * parent entities.
+ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number
+ * of active groups once for each of bfqq's inactive parent entities.
  */
 void bfq_weights_tree_remove(struct bfq_data *bfqd,
                             struct bfq_queue *bfqq)
 {
        struct bfq_entity *entity = bfqq->entity.parent;
 
-       __bfq_weights_tree_remove(bfqd, &bfqq->entity,
+       __bfq_weights_tree_remove(bfqd, bfqq,
                                  &bfqd->queue_weights_tree);
 
        for_each_entity(entity) {
@@ -797,17 +796,13 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
                         * next_in_service for details on why
                         * in_service_entity must be checked too).
                         *
-                        * As a consequence, the weight of entity is
-                        * not to be removed. In addition, if entity
-                        * is active, then its parent entities are
-                        * active as well, and thus their weights are
-                        * not to be removed either. In the end, this
-                        * loop must stop here.
+                        * As a consequence, its parent entities are
+                        * active as well, and thus this loop must
+                        * stop here.
                         */
                        break;
                }
-               __bfq_weights_tree_remove(bfqd, entity,
-                                         &bfqd->group_weights_tree);
+               bfqd->num_active_groups--;
        }
 }
 
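
The ancestor walk just above can be pictured with a simplified, hypothetical sketch; the real code decides whether an ancestor is still active by looking at sd->next_in_service and sd->in_service_entity rather than a boolean flag:

/* Simplified stand-in for a bfq group entity (illustrative only). */
struct toy_group {
	struct toy_group *parent;
	int still_active;	/* has an in-service or next-in-service child entity */
};

static void toy_queue_went_idle(struct toy_group *g, unsigned int *num_active_groups)
{
	for (; g; g = g->parent) {
		if (g->still_active)
			break;		/* this ancestor, and all above it, stay active */
		(*num_active_groups)--;	/* mirrors bfqd->num_active_groups-- */
	}
}
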
@@ -3182,6 +3177,13 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
                    jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
 }
 
+static bool bfq_bfqq_injectable(struct bfq_queue *bfqq)
+{
+       return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
+               blk_queue_nonrot(bfqq->bfqd->queue) &&
+               bfqq->bfqd->hw_tag;
+}
+
 /**
  * bfq_bfqq_expire - expire a queue.
  * @bfqd: device owning the queue.
@@ -3291,6 +3293,8 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
        if (ref == 1) /* bfqq is gone, no more actions on it */
                return;
 
+       bfqq->injected_service = 0;
+
        /* mark bfqq as waiting a request only if a bic still points to it */
        if (!bfq_bfqq_busy(bfqq) &&
            reason != BFQQE_BUDGET_TIMEOUT &&
@@ -3497,9 +3501,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
         * symmetric scenario where:
         * (i)  each of these processes must get the same throughput as
         *      the others;
-        * (ii) all these processes have the same I/O pattern
-               (either sequential or random).
-        * In fact, in such a scenario, the drive will tend to treat
+        * (ii) the I/O of each process has the same properties, in
+        *      terms of locality (sequential or random), direction
+        *      (reads or writes), request sizes, greediness
+        *      (from I/O-bound to sporadic), and so on.
+        * In fact, in such a scenario, the drive tends to treat
         * the requests of each of these processes in about the same
         * way as the requests of the others, and thus to provide
         * each of these processes with about the same throughput
@@ -3508,18 +3514,50 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
         * certainly needed to guarantee that bfqq receives its
         * assigned fraction of the device throughput (see [1] for
         * details).
+        * The problem is that idling may significantly reduce
+        * throughput with certain combinations of types of I/O and
+        * devices. An important example is sync random I/O, on flash
+        * storage with command queueing. So, unless bfqq falls in the
+        * above cases where idling also boosts throughput, it would
+        * be important to check conditions (i) and (ii) accurately,
+        * so as to avoid idling when not strictly needed for service
+        * guarantees.
+        *
+        * Unfortunately, it is extremely difficult to thoroughly
+        * check condition (ii). And, in case there are active groups,
+        * it becomes very difficult to check condition (i) too. In
+        * fact, if there are active groups, then, for condition (i)
+        * to become false, it is enough that an active group contains
+        * more active processes or sub-groups than some other active
+        * group. We address this issue with the following bi-modal
+        * behavior, implemented in the function
+        * bfq_symmetric_scenario().
         *
-        * We address this issue by controlling, actually, only the
-        * symmetry sub-condition (i), i.e., provided that
-        * sub-condition (i) holds, idling is not performed,
-        * regardless of whether sub-condition (ii) holds. In other
-        * words, only if sub-condition (i) holds, then idling is
+        * If there are active groups, then the scenario is tagged as
+        * asymmetric, conservatively, without checking any of the
+        * conditions (i) and (ii). So the device is idled for bfqq.
+        * This behavior also matches the fact that groups are created
+        * exactly when controlling I/O (to preserve bandwidth and
+        * latency guarantees) is a primary concern.
+        *
+        * On the opposite end, if there are no active groups, then
+        * only condition (i) is actually controlled, i.e., provided
+        * that condition (i) holds, idling is not performed,
+        * regardless of whether condition (ii) holds. In other words,
+        * only if condition (i) does not hold, then idling is
         * allowed, and the device tends to be prevented from queueing
-        * many requests, possibly of several processes. The reason
-        * for not controlling also sub-condition (ii) is that we
-        * exploit preemption to preserve guarantees in case of
-        * symmetric scenarios, even if (ii) does not hold, as
-        * explained in the next two paragraphs.
+        * many requests, possibly of several processes. Since there
+        * are no active groups, then, to control condition (i) it is
+        * enough to check whether all active queues have the same
+        * weight.
+        *
+        * Not checking condition (ii) evidently exposes bfqq to the
+        * risk of getting less throughput than its fair share.
+        * However, for queues with the same weight, a further
+        * mechanism, preemption, mitigates or even eliminates this
+        * problem. And it does so without consequences on overall
+        * throughput. This mechanism and its benefits are explained
+        * in the next three paragraphs.
         *
         * Even if a queue, say Q, is expired when it remains idle, Q
         * can still preempt the new in-service queue if the next
@@ -3533,11 +3571,7 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
         * idling allows the internal queues of the device to contain
         * many requests, and thus to reorder requests, we can rather
         * safely assume that the internal scheduler still preserves a
-        * minimum of mid-term fairness. The motivation for using
-        * preemption instead of idling is that, by not idling,
-        * service guarantees are preserved without minimally
-        * sacrificing throughput. In other words, both a high
-        * throughput and its desired distribution are obtained.
+        * minimum of mid-term fairness.
         *
         * More precisely, this preemption-based, idleless approach
         * provides fairness in terms of IOPS, and not sectors per
@@ -3556,22 +3590,27 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
         * 1024/8 times as high as the service received by the other
         * queue.
         *
-        * On the other hand, device idling is performed, and thus
-        * pure sector-domain guarantees are provided, for the
-        * following queues, which are likely to need stronger
-        * throughput guarantees: weight-raised queues, and queues
-        * with a higher weight than other queues. When such queues
-        * are active, sub-condition (i) is false, which triggers
-        * device idling.
+        * The motivation for using preemption instead of idling (for
+        * queues with the same weight) is that, by not idling,
+        * service guarantees are preserved (completely or at least in
+        * part) without sacrificing throughput at all. And, if
+        * there is no active group, then the primary expectation for
+        * this device is probably a high throughput.
         *
-        * According to the above considerations, the next variable is
-        * true (only) if sub-condition (i) holds. To compute the
-        * value of this variable, we not only use the return value of
-        * the function bfq_symmetric_scenario(), but also check
-        * whether bfqq is being weight-raised, because
-        * bfq_symmetric_scenario() does not take into account also
-        * weight-raised queues (see comments on
-        * bfq_weights_tree_add()).
+        * We are now left only with explaining the additional
+        * compound condition that is checked below for deciding
+        * whether the scenario is asymmetric. To explain this
+        * compound condition, we need to add that the function
+        * bfq_symmetric_scenario checks the weights of only
+        * non-weight-raised queues, for efficiency reasons (see
+        * comments on bfq_weights_tree_add()). Then the fact that
+        * bfqq is weight-raised is checked explicitly here. More
+        * precisely, the compound condition below takes into account
+        * also the fact that, even if bfqq is being weight-raised,
+        * the scenario is still symmetric if all active queues happen
+        * to be weight-raised. Actually, we should be even more
+        * precise here, and differentiate between interactive weight
+        * raising and soft real-time weight raising.
         *
         * As a side note, it is worth considering that the above
         * device-idling countermeasures may however fail in the
@@ -3583,7 +3622,8 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
         * to let requests be served in the desired order until all
         * the requests already queued in the device have been served.
         */
-       asymmetric_scenario = bfqq->wr_coeff > 1 ||
+       asymmetric_scenario = (bfqq->wr_coeff > 1 &&
+                              bfqd->wr_busy_queues < bfqd->busy_queues) ||
                !bfq_symmetric_scenario(bfqd);
 
        /*
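
A compact restatement of the condition computed above, with hypothetical names (not kernel code): the scenario is treated as asymmetric, and idling enforced, when bfqq is weight-raised while some busy queue is not, or when bfq_symmetric_scenario() reports varied queue weights or active groups:

#include <stdbool.h>

static bool toy_asymmetric_scenario(bool bfqq_is_weight_raised,
				    int wr_busy_queues, int busy_queues,
				    bool varied_weights_or_active_groups)
{
	return (bfqq_is_weight_raised && wr_busy_queues < busy_queues) ||
		varied_weights_or_active_groups;
}
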
@@ -3629,6 +3669,30 @@ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
        return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
 }
 
+static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
+{
+       struct bfq_queue *bfqq;
+
+       /*
+        * A linear search; but, with a high probability, very few
+        * steps are needed to find a candidate queue, i.e., a queue
+        * with enough budget left for its next request. In fact:
+        * - BFQ dynamically updates the budget of every queue so as
+        *   to accommodate the expected backlog of the queue;
+        * - if a queue gets all its requests dispatched as injected
+        *   service, then the queue is removed from the active list
+        *   (and re-added only if it gets new requests, but with
+        *   enough budget for its new backlog).
+        */
+       list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
+               if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
+                   bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
+                   bfq_bfqq_budget_left(bfqq))
+                       return bfqq;
+
+       return NULL;
+}
+
 /*
  * Select a queue for service.  If we have a current queue in service,
  * check whether to continue servicing it, or retrieve and set a new one.
@@ -3710,10 +3774,19 @@ check_queue:
         * No requests pending. However, if the in-service queue is idling
         * for a new request, or has requests waiting for a completion and
         * may idle after their completion, then keep it anyway.
+        *
+        * Yet, to boost throughput, inject service from other queues if
+        * possible.
         */
        if (bfq_bfqq_wait_request(bfqq) ||
            (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
-               bfqq = NULL;
+               if (bfq_bfqq_injectable(bfqq) &&
+                   bfqq->injected_service * bfqq->inject_coeff <
+                   bfqq->entity.service * 10)
+                       bfqq = bfq_choose_bfqq_for_injection(bfqd);
+               else
+                       bfqq = NULL;
+
                goto keep_queue;
        }
 
@@ -3803,6 +3876,14 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
 
        bfq_dispatch_remove(bfqd->queue, rq);
 
+       if (bfqq != bfqd->in_service_queue) {
+               if (likely(bfqd->in_service_queue))
+                       bfqd->in_service_queue->injected_service +=
+                               bfq_serv_to_charge(rq, bfqq);
+
+               goto return_rq;
+       }
+
        /*
         * If weight raising has to terminate for bfqq, then next
         * function causes an immediate update of bfqq's weight,
@@ -3821,13 +3902,12 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
         * belongs to CLASS_IDLE and other queues are waiting for
         * service.
         */
-       if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
-               goto expire;
-
-       return rq;
+       if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
+               goto return_rq;
 
-expire:
        bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
+
+return_rq:
        return rq;
 }
 
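
The injected_service counter updated in the dispatch path above closes a small feedback loop with the earlier hunks: bfq_select_queue() keeps injecting only while roughly ten times bfqq's own service has not yet been injected, and bfq_bfqq_expire() resets the counter. A condensed sketch of that loop, with hypothetical toy_* names and simplified types:

#include <stdbool.h>

struct toy_bfqq {
	unsigned long injected_service;	/* service injected while this queue was in service */
	unsigned long own_service;	/* service received by the queue itself */
	unsigned int inject_coeff;	/* set to 1 at queue initialization */
};

/* Dispatch path: a request of another queue is charged to the in-service queue. */
static void toy_charge_injection(struct toy_bfqq *in_service, unsigned long charge)
{
	in_service->injected_service += charge;
}

/* Selection path: keep injecting while the cap has not been reached. */
static bool toy_keep_injecting(const struct toy_bfqq *q)
{
	return q->injected_service * q->inject_coeff < q->own_service * 10;
}

/* Expiration path: start the next service slot from a clean slate. */
static void toy_reset_injection(struct toy_bfqq *q)
{
	q->injected_service = 0;
}
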
@@ -4232,6 +4312,13 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                        bfq_mark_bfqq_has_short_ttime(bfqq);
                bfq_mark_bfqq_sync(bfqq);
                bfq_mark_bfqq_just_created(bfqq);
+               /*
+                * Aggressively inject a lot of service: up to 90%.
+                * This coefficient remains constant during bfqq life,
+                * but this behavior might be changed, after enough
+                * testing and tuning.
+                */
+               bfqq->inject_coeff = 1;
        } else
                bfq_clear_bfqq_sync(bfqq);
 
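
The "up to 90%" in the comment above follows from the cap used in bfq_select_queue(): with inject_coeff equal to 1, injection stops once injected_service reaches ten times bfqq's own service, i.e. at most 10/11 (about 90%) of the total service charged during bfqq's service slot. A quick standalone check of that arithmetic (illustrative only):

#include <assert.h>

int main(void)
{
	unsigned long own = 100, coeff = 1;
	/* injection stops once injected_service reaches own * 10 / coeff */
	unsigned long injected_cap = own * 10 / coeff;

	/* injected share of total service at the cap: 1000 / 1100 ~= 90.9% */
	assert(injected_cap * 100 / (injected_cap + own) == 90);
	return 0;
}
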
@@ -4297,7 +4384,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 
        rcu_read_lock();
 
-       bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+       bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio));
        if (!bfqg) {
                bfqq = &bfqd->oom_bfqq;
                goto out;
@@ -5330,7 +5417,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
        bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
 
        bfqd->queue_weights_tree = RB_ROOT;
-       bfqd->group_weights_tree = RB_ROOT;
+       bfqd->num_active_groups = 0;
 
        INIT_LIST_HEAD(&bfqd->active_list);
        INIT_LIST_HEAD(&bfqd->idle_list);