erofs: kill hooked chains to avoid loops on deduplicated compressed images
authorGao Xiang <hsiangkao@linux.alibaba.com>
Fri, 26 May 2023 20:14:56 +0000 (04:14 +0800)
committerGao Xiang <hsiangkao@linux.alibaba.com>
Mon, 29 May 2023 15:05:43 +0000 (23:05 +0800)
After heavily stressing EROFS with several images which include a
hand-crafted image of repeated patterns for more than 46 days, I found
two chains could be linked with each other almost simultaneously and
form a loop so that the entire loop won't be submitted.  As a
consequence, the corresponding file pages will remain locked forever.

It can be _only_ observed on data-deduplicated compressed images.
For example, consider two chains with five pclusters in total:
Chain 1:  2->3->4->5    -- The tail pcluster is 5;
        Chain 2:  5->1->2       -- The tail pcluster is 2.

Chain 2 could link to Chain 1 with pcluster 5; and Chain 1 could link
to Chain 2 at the same time with pcluster 2.

Since hooked chains are all linked locklessly now, I have no idea how
to simply avoid the race.  Instead, let's avoid hooked chains completely
until I could work out a proper way to fix this and end users finally
tell us that it's needed to add it back.

Actually, this optimization can be found with multi-threaded workloads
(especially even more often on deduplicated compressed images), yet I'm
not sure about the overall system impacts of not having this compared
with implementation complexity.

Fixes: 267f2492c8f7 ("erofs: introduce multi-reference pclusters (fully-referenced)")
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Reviewed-by: Yue Hu <huyue2@coolpad.com>
Link: https://lore.kernel.org/r/20230526201459.128169-4-hsiangkao@linux.alibaba.com
fs/erofs/zdata.c

index 22be768a5b8a56ba14824f01b3b50e6520e4b679..d29552ea53fcde5481af80a4bdc0142ebbdf82fb 100644 (file)
@@ -93,11 +93,8 @@ struct z_erofs_pcluster {
 
 /* let's avoid the valid 32-bit kernel addresses */
 
-/* the chained workgroup has't submitted io (still open) */
+/* the end of a chain of pclusters */
 #define Z_EROFS_PCLUSTER_TAIL           ((void *)0x5F0ECAFE)
-/* the chained workgroup has already submitted io */
-#define Z_EROFS_PCLUSTER_TAIL_CLOSED    ((void *)0x5F0EDEAD)
-
 #define Z_EROFS_PCLUSTER_NIL            (NULL)
 
 struct z_erofs_decompressqueue {
@@ -504,20 +501,6 @@ out_error_pcluster_pool:
 
 enum z_erofs_pclustermode {
        Z_EROFS_PCLUSTER_INFLIGHT,
-       /*
-        * The current pclusters was the tail of an exist chain, in addition
-        * that the previous processed chained pclusters are all decided to
-        * be hooked up to it.
-        * A new chain will be created for the remaining pclusters which are
-        * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
-        * the next pcluster cannot reuse the whole page safely for inplace I/O
-        * in the following scenario:
-        *  ________________________________________________________________
-        * |      tail (partial) page     |       head (partial) page       |
-        * |   (belongs to the next pcl)  |   (belongs to the current pcl)  |
-        * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
-        */
-       Z_EROFS_PCLUSTER_HOOKED,
        /*
         * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
         * could be dispatched into bypass queue later due to uptodated managed
@@ -535,8 +518,8 @@ enum z_erofs_pclustermode {
         *  ________________________________________________________________
         * |  tail (partial) page |          head (partial) page           |
         * |  (of the current cl) |      (of the previous collection)      |
-        * | PCLUSTER_FOLLOWED or |                                        |
-        * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
+        * |                      |                                        |
+        * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________|
         *
         * [  (*) the above page can be used as inplace I/O.               ]
         */
@@ -550,7 +533,7 @@ struct z_erofs_decompress_frontend {
 
        struct page *pagepool;
        struct page *candidate_bvpage;
-       struct z_erofs_pcluster *pcl, *tailpcl;
+       struct z_erofs_pcluster *pcl;
        z_erofs_next_pcluster_t owned_head;
        enum z_erofs_pclustermode mode;
 
@@ -755,19 +738,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
                return;
        }
 
-       /*
-        * type 2, link to the end of an existing open chain, be careful
-        * that its submission is controlled by the original attached chain.
-        */
-       if (*owned_head != &pcl->next && pcl != f->tailpcl &&
-           cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
-                   *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
-               *owned_head = Z_EROFS_PCLUSTER_TAIL;
-               f->mode = Z_EROFS_PCLUSTER_HOOKED;
-               f->tailpcl = NULL;
-               return;
-       }
-       /* type 3, it belongs to a chain, but it isn't the end of the chain */
+       /* type 2, it belongs to an ongoing chain */
        f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
 }
 
@@ -828,9 +799,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
                        goto err_out;
                }
        }
-       /* used to check tail merging loop due to corrupted images */
-       if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
-               fe->tailpcl = pcl;
        fe->owned_head = &pcl->next;
        fe->pcl = pcl;
        return 0;
@@ -851,7 +819,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
 
        /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
        DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
-       DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
 
        if (!(map->m_flags & EROFS_MAP_META)) {
                grp = erofs_find_workgroup(fe->inode->i_sb,
@@ -870,10 +837,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
 
        if (ret == -EEXIST) {
                mutex_lock(&fe->pcl->lock);
-               /* used to check tail merging loop due to corrupted images */
-               if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
-                       fe->tailpcl = fe->pcl;
-
                z_erofs_try_to_claim_pcluster(fe);
        } else if (ret) {
                return ret;
@@ -1028,8 +991,7 @@ hitted:
         * those chains are handled asynchronously thus the page cannot be used
         * for inplace I/O or bvpage (should be processed in a strict order.)
         */
-       tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
-                 fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
+       tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
 
        cur = end - min_t(unsigned int, offset + end - map->m_la, end);
        if (!(map->m_flags & EROFS_MAP_MAPPED)) {
@@ -1398,10 +1360,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
        };
        z_erofs_next_pcluster_t owned = io->head;
 
-       while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
-               /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
-               DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
-               /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
+       while (owned != Z_EROFS_PCLUSTER_TAIL) {
                DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
 
                be.pcl = container_of(owned, struct z_erofs_pcluster, next);
@@ -1418,7 +1377,7 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
                container_of(work, struct z_erofs_decompressqueue, u.work);
        struct page *pagepool = NULL;
 
-       DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+       DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
        z_erofs_decompress_queue(bgq, &pagepool);
        erofs_release_pages(&pagepool);
        kvfree(bgq);
@@ -1606,7 +1565,7 @@ fg_out:
                q->sync = true;
        }
        q->sb = sb;
-       q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
+       q->head = Z_EROFS_PCLUSTER_TAIL;
        return q;
 }
 
@@ -1624,11 +1583,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
        z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
        z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
 
-       DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
-       if (owned_head == Z_EROFS_PCLUSTER_TAIL)
-               owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
-
-       WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
+       WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
 
        WRITE_ONCE(*submit_qtail, owned_head);
        WRITE_ONCE(*bypass_qtail, &pcl->next);
@@ -1698,15 +1653,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
                unsigned int i = 0;
                bool bypass = true;
 
-               /* no possible 'owned_head' equals the following */
-               DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
                DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
-
                pcl = container_of(owned_head, struct z_erofs_pcluster, next);
+               owned_head = READ_ONCE(pcl->next);
 
-               /* close the main owned chain at first */
-               owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
-                                    Z_EROFS_PCLUSTER_TAIL_CLOSED);
                if (z_erofs_is_inline_pcluster(pcl)) {
                        move_to_bypass_jobqueue(pcl, qtail, owned_head);
                        continue;