drbd: Clarify when activity log I/O is delegated to the worker thread
[linux-2.6-block.git] / drivers / block / drbd / drbd_actlog.c
index 92510f8ad0131f480aac0fddc48d8e889f8d690f..82199d9a9a61fed987661d7b7cbae8bac7510156 100644 (file)
@@ -104,7 +104,7 @@ struct update_al_work {
        int err;
 };
 
-static int al_write_transaction(struct drbd_conf *mdev);
+static int al_write_transaction(struct drbd_conf *mdev, bool delegate);
 
 void *drbd_md_get_buffer(struct drbd_conf *mdev)
 {
@@ -168,7 +168,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
        bio->bi_end_io = drbd_md_io_complete;
        bio->bi_rw = rw;
 
-       if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* Corresponding put_ldev in drbd_md_io_complete() */
+       if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL)
+               /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
+               ;
+       else if (!get_ldev_if_state(mdev, D_ATTACHING)) {
+               /* Corresponding put_ldev in drbd_md_io_complete() */
                dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
                err = -ENODEV;
                goto out;
@@ -199,9 +203,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
 
        BUG_ON(!bdev->md_bdev);
 
-       dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n",
+       dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
             current->comm, current->pid, __func__,
-            (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+            (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
+            (void*)_RET_IP_ );
 
        if (sector < drbd_md_first_sector(bdev) ||
            sector + 7 > drbd_md_last_sector(bdev))
@@ -209,7 +214,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
                     current->comm, current->pid, __func__,
                     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
 
-       err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE);
+       /* we do all our meta data IO in aligned 4k blocks. */
+       err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096);
        if (err) {
                dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
                    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
@@ -240,7 +246,10 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
        return al_ext;
 }
 
-void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
+/*
+ * @delegate:   delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate)
 {
        /* for bios crossing activity log extent boundaries,
         * we may need to activate two extents in one go */
@@ -249,6 +258,17 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
        unsigned enr;
        bool locked = false;
 
+       /* When called through generic_make_request(), we must delegate
+        * activity log I/O to the worker thread: a further request
+        * submitted via generic_make_request() within the same task
+        * would be queued on current->bio_list, and would only start
+        * after this function returns (see generic_make_request()).
+        *
+        * However, if we *are* the worker, we must not delegate to ourselves.
+        */
+
+       if (delegate)
+               BUG_ON(current == mdev->tconn->worker.task);
 
        D_ASSERT(first <= last);
        D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
@@ -264,13 +284,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
                        (locked = lc_try_lock_for_transaction(mdev->act_log)));
 
        if (locked) {
-               /* drbd_al_write_transaction(mdev,al_ext,enr);
-                * recurses into generic_make_request(), which
-                * disallows recursion, bios being serialized on the
-                * current->bio_tail list now.
-                * we have to delegate updates to the activity log
-                * to the worker thread. */
-
                /* Double check: it may have been committed by someone else,
                 * while we have been waiting for the lock. */
                if (mdev->act_log->pending_changes) {
@@ -281,7 +294,7 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
                        rcu_read_unlock();
 
                        if (write_al_updates) {
-                               al_write_transaction(mdev);
+                               al_write_transaction(mdev, delegate);
                                mdev->al_writ_cnt++;
                        }
 
@@ -350,6 +363,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
                 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
 }
 
+static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev)
+{
+       const unsigned int stripes = mdev->ldev->md.al_stripes;
+       const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k;
+
+       /* transaction number, modulo on-disk ring buffer wrap around (al_size_4k slots); NOTE(review): al_size_4k and al_stripes are divisors here, confirm both are validated as non-zero elsewhere */
+       unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k);
+
+       /* ... to aligned 4k on disk block: t % stripes selects the stripe, t/stripes the slot inside it, so consecutive transactions rotate through the stripes */
+       t = ((t % stripes) * stripe_size_4kB) + t/stripes;
+
+       /* ... to 512 byte sector in activity log (one 4k block == 8 sectors of 512 bytes) */
+       t *= 8;
+
+       /* ... plus offset to the on disk position: md_offset + al_offset locate the start of the activity log, t is the sector offset inside it */
+       return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t;
+}
+
 static int
 _al_write_transaction(struct drbd_conf *mdev)
 {
@@ -432,13 +463,12 @@ _al_write_transaction(struct drbd_conf *mdev)
        if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
                mdev->al_tr_cycle = 0;
 
-       sector =  mdev->ldev->md.md_offset
-               + mdev->ldev->md.al_offset
-               + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);
+       sector = al_tr_number_to_on_disk_sector(mdev);
 
        crc = crc32c(0, buffer, 4096);
        buffer->crc32c = cpu_to_be32(crc);
 
+       /* normal execution path goes through all three branches */
        if (drbd_bm_write_hinted(mdev))
                err = -EIO;
                /* drbd_chk_io_error done already */
@@ -446,8 +476,6 @@ _al_write_transaction(struct drbd_conf *mdev)
                err = -EIO;
                drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
        } else {
-               /* advance ringbuffer position and transaction counter */
-               mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
                mdev->al_tr_number++;
        }
 
@@ -474,20 +502,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused)
 /* Calls from worker context (see w_restart_disk_io()) need to write the
    transaction directly. Others came through generic_make_request(),
    those need to delegate it to the worker. */
-static int al_write_transaction(struct drbd_conf *mdev)
+static int al_write_transaction(struct drbd_conf *mdev, bool delegate)
 {
-       struct update_al_work al_work;
-
-       if (current == mdev->tconn->worker.task)
+       if (delegate) { /* caller decides: hand the write off instead of issuing it from the current context */
+               struct update_al_work al_work; /* on-stack work item; stays valid because we do not return before wait_for_completion() below */
+               init_completion(&al_work.event);
+               al_work.w.cb = w_al_write_transaction;
+               al_work.w.mdev = mdev;
+               drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
+               wait_for_completion(&al_work.event); /* block until al_work.event is completed; presumably by w_al_write_transaction(), confirm */
+               return al_work.err; /* result handed back through the work item; presumably set by the callback, confirm */
+       } else /* not delegating: write the transaction directly from the current context */
                return _al_write_transaction(mdev);
-
-       init_completion(&al_work.event);
-       al_work.w.cb = w_al_write_transaction;
-       al_work.w.mdev = mdev;
-       drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
-       wait_for_completion(&al_work.event);
-
-       return al_work.err;
 }
 
 static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)