btrfs: introduce RAID1 round-robin read balancing
author Anand Jain <anand.jain@oracle.com>
Wed, 1 Jan 2025 18:06:35 +0000 (02:06 +0800)
committer David Sterba <dsterba@suse.com>
Mon, 13 Jan 2025 13:53:21 +0000 (14:53 +0100)
Add a round-robin read policy that balances reads over the available
devices (all RAID1 block group profiles). Switching to the next device is
done after a number of blocks has been read, which is 256K by default and
is configurable in sysfs.

The format is "round-robin:<min-contig-read>" and can be set in file

  /sys/fs/btrfs/FSID/read_policy

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/sysfs.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

index 2880407d0dd30be6730614bd65da6a48a7b402ef..e155b7ce1ee58460f865e190bb8befa45ed29414 100644 (file)
@@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
 }
 BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
 
-static const char * const btrfs_read_policy_name[] = { "pid" };
+/* Names of the read policies, indexed by enum btrfs_read_policy. */
+static const char * const btrfs_read_policy_name[] = {
+       "pid",
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+       "round-robin",
+#endif
+};
 
 static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret)
 {
@@ -1355,6 +1360,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
 
                ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+               if (i == BTRFS_READ_POLICY_RR)
+                       ret += sysfs_emit_at(buf, ret, ":%u",
+                                            READ_ONCE(fs_devices->rr_min_contig_read));
+#endif
+
                if (i == policy)
                        ret += sysfs_emit_at(buf, ret, "]");
        }
@@ -1376,6 +1387,41 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
        if (index < 0)
                return -EINVAL;
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+       /* If moving from RR then disable collecting fs stats. */
+       if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR)
+               fs_devices->collect_fs_stats = false;
+
+       if (index == BTRFS_READ_POLICY_RR) {
+               if (value != -1) {
+                       const u32 sectorsize = fs_devices->fs_info->sectorsize;
+
+                       if (!IS_ALIGNED(value, sectorsize)) {
+                               u64 temp_value = round_up(value, sectorsize);
+
+                               btrfs_debug(fs_devices->fs_info,
+"read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu",
+                                         value, sectorsize, temp_value);
+                               value = temp_value;
+                       }
+               } else {
+                       value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
+               }
+
+               if (index != READ_ONCE(fs_devices->read_policy) ||
+                   value != READ_ONCE(fs_devices->rr_min_contig_read)) {
+                       WRITE_ONCE(fs_devices->read_policy, index);
+                       WRITE_ONCE(fs_devices->rr_min_contig_read, value);
+
+                       btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'",
+                                  btrfs_read_policy_name[index], value);
+               }
+
+               fs_devices->collect_fs_stats = true;
+
+               return len;
+       }
+#endif
        if (index != READ_ONCE(fs_devices->read_policy)) {
                WRITE_ONCE(fs_devices->read_policy, index);
                btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
index e5d5cfb2d23965b27165a130cdbc9bc62e51b01c..cfe1d5ada5f248e68be1b6bf86d72be044006782 100644 (file)
@@ -1329,6 +1329,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
        fs_devices->total_rw_bytes = 0;
        fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
        fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+       fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
+#endif
 
        return 0;
 }
@@ -5953,6 +5956,63 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
        return len;
 }
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+/* Pairs a mirror's device id with its stripe index, so mirrors can be sorted by devid. */
+struct stripe_mirror {
+       /* Device id of the stripe's device, used as the sort key. */
+       u64 devid;
+       /* Index of the stripe in the chunk map's stripes array. */
+       int num;
+};
+
+/* Comparison callback for sort(): orders stripe_mirror entries by ascending devid. */
+static int btrfs_cmp_devid(const void *a, const void *b)
+{
+       const struct stripe_mirror *lhs = a;
+       const struct stripe_mirror *rhs = b;
+
+       if (lhs->devid == rhs->devid)
+               return 0;
+
+       return (lhs->devid < rhs->devid) ? -1 : 1;
+}
+
+/*
+ * Select a stripe for reading using the round-robin algorithm.
+ *
+ *  1. Compute the read cycle as the total sectors read divided by the minimum
+ *     sectors per device.
+ *  2. Determine the stripe number for the current read by taking the modulus
+ *     of the read cycle with the total number of stripes:
+ *
+ *      stripe index = (total sectors / min sectors per dev) % num stripes
+ *
+ * The calculated stripe index is then used to select the corresponding device
+ * from the list of devices, which is ordered by devid.
+ *
+ * @map:         chunk map whose stripes are the candidate mirrors
+ * @first:       index in map->stripes of the first candidate mirror
+ * @num_stripes: number of candidate mirrors starting at @first, must not
+ *               exceed BTRFS_RAID1_MAX_MIRRORS
+ *
+ * Return: index into map->stripes of the mirror to read from.
+ */
+static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes)
+{
+       struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 };
+       struct btrfs_device *device = map->stripes[first].dev;
+       struct btrfs_fs_info *fs_info = device->fs_devices->fs_info;
+       unsigned int read_cycle;
+       unsigned int total_reads;
+       unsigned int min_reads_per_dev;
+
+       total_reads = percpu_counter_sum(&fs_info->stats_read_blocks);
+       min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >>
+                                                      fs_info->sectorsize_bits;
+
+       /*
+        * A configured minimum below one sector (e.g. 0) shifts down to zero
+        * blocks. Avoid dividing by zero and switch device after every block.
+        */
+       if (min_reads_per_dev == 0)
+               min_reads_per_dev = 1;
+
+       /* Collect the candidate mirrors and order them by devid. */
+       for (int index = 0, i = first; i < first + num_stripes; i++) {
+               stripes[index].devid = map->stripes[i].dev->devid;
+               stripes[index].num = i;
+               index++;
+       }
+       sort(stripes, num_stripes, sizeof(struct stripe_mirror),
+            btrfs_cmp_devid, NULL);
+
+       read_cycle = total_reads / min_reads_per_dev;
+       return stripes[read_cycle % num_stripes].num;
+}
+#endif
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
                            struct btrfs_chunk_map *map, int first,
                            int dev_replace_is_ongoing)
@@ -5982,6 +6042,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
        case BTRFS_READ_POLICY_PID:
                preferred_mirror = first + (current->pid % num_stripes);
                break;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+       case BTRFS_READ_POLICY_RR:
+               preferred_mirror = btrfs_read_rr(map, first, num_stripes);
+               break;
+#endif
        }
 
        if (dev_replace_is_ongoing &&
index 77926fdb6b0dd88f46fb60217f03a0b449bb5669..f9fe698a9b4b405e2ae9b3d59c825bf73ee568f3 100644 (file)
@@ -296,6 +296,9 @@ enum btrfs_chunk_allocation_policy {
        BTRFS_CHUNK_ALLOC_ZONED,
 };
 
+/* Default minimum contiguous read per device for the round-robin policy, in bytes. */
+#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ       (SZ_256K)
+/* Keep in sync with raid_attr table, current maximum is RAID1C4. */
+#define BTRFS_RAID1_MAX_MIRRORS                        (4)
 /*
  * Read policies for mirrored block group profiles, read picks the stripe based
  * on these policies.
@@ -303,6 +306,10 @@ enum btrfs_chunk_allocation_policy {
 enum btrfs_read_policy {
        /* Use process PID to choose the stripe */
        BTRFS_READ_POLICY_PID,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+       /*
+        * Round-robin: rotate RAID1 reads across the mirror devices, moving to
+        * the next device after a minimum amount of contiguous reads.
+        */
+       BTRFS_READ_POLICY_RR,
+#endif
        BTRFS_NR_READ_POLICY,
 };
 
@@ -433,6 +440,12 @@ struct btrfs_fs_devices {
        enum btrfs_read_policy read_policy;
 
 #ifdef CONFIG_BTRFS_EXPERIMENTAL
+       /*
+        * Minimum contiguous read size in bytes before switching to the next
+        * device, kept a multiple of the block size (sectorsize).
+        */
+       u32 rr_min_contig_read;
+
        /* Checksum mode - offload it or do it synchronously. */
        enum btrfs_offload_csum_mode offload_csum_mode;
 #endif