ceph: update support for PGID64, PGPOOL3, OSDENC protocol features
author: Sage Weil <sage@inktank.com>
Sat, 23 Feb 2013 18:41:09 +0000 (10:41 -0800)
committer: Sage Weil <sage@inktank.com>
Tue, 26 Feb 2013 23:02:25 +0000 (15:02 -0800)
Support (and require) the PGID64, PGPOOL3, and OSDENC protocol features.
These have been present in ceph.git since v0.42, Feb 2012.  Require these
features to simplify support; nobody is running older userspace.

Note that the new request and reply encoding is still not in place, so the new
code is not yet functional.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
fs/ceph/mdsmap.c
include/linux/ceph/ceph_features.h
include/linux/ceph/mdsmap.h
include/linux/ceph/osdmap.h
include/linux/ceph/rados.h
net/ceph/ceph_common.c
net/ceph/debugfs.c
net/ceph/osdmap.c

index 73b7d44e8a354264e3f08f66e8cb788851328029..0d3c9240c61bc80031f85a94ba462809b1d7249a 100644 (file)
@@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                return ERR_PTR(-ENOMEM);
 
        ceph_decode_16_safe(p, end, version, bad);
+       if (version > 3) {
+               pr_warning("got mdsmap version %d > 3, failing", version);
+               goto bad;
+       }
 
        ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
        m->m_epoch = ceph_decode_32(p);
@@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
        /* pg_pools */
        ceph_decode_32_safe(p, end, n, bad);
        m->m_num_data_pg_pools = n;
-       m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
+       m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
        if (!m->m_data_pg_pools)
                goto badmem;
-       ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
+       ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
        for (i = 0; i < n; i++)
-               m->m_data_pg_pools[i] = ceph_decode_32(p);
-       m->m_cas_pg_pool = ceph_decode_32(p);
+               m->m_data_pg_pools[i] = ceph_decode_64(p);
+       m->m_cas_pg_pool = ceph_decode_64(p);
 
        /* ok, we don't care about the rest. */
        dout("mdsmap_decode success epoch %u\n", m->m_epoch);
index 9e0f5a8ba2470c90716061abee3e8d927666205d..ab0a54286e0d6663e8ec5b9ecf8b4339b1c75ca1 100644 (file)
  * Features supported.
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  \
-       (CEPH_FEATURE_NOSRCADDR |        \
-        CEPH_FEATURE_CRUSH_TUNABLES |    \
-        CEPH_FEATURE_CRUSH_TUNABLES2 |   \
+       (CEPH_FEATURE_NOSRCADDR |               \
+        CEPH_FEATURE_PGID64 |                  \
+        CEPH_FEATURE_PGPOOL3 |                 \
+        CEPH_FEATURE_OSDENC |                  \
+        CEPH_FEATURE_CRUSH_TUNABLES |          \
+        CEPH_FEATURE_CRUSH_TUNABLES2 |         \
         CEPH_FEATURE_REPLY_CREATE_INODE)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
-       (CEPH_FEATURE_NOSRCADDR)
+       (CEPH_FEATURE_NOSRCADDR |        \
+        CEPH_FEATURE_PGID64 |           \
+        CEPH_FEATURE_PGPOOL3 |          \
+        CEPH_FEATURE_OSDENC)
 #endif
index cb15b5d867c75b3c1ab20e883b04e2ac43f02b52..87ed09f548007eb5ce6e8e50651dfc0484387091 100644 (file)
@@ -29,8 +29,8 @@ struct ceph_mdsmap {
 
        /* which object pools file data can be stored in */
        int m_num_data_pg_pools;
-       u32 *m_data_pg_pools;
-       u32 m_cas_pg_pool;
+       u64 *m_data_pg_pools;
+       u64 m_cas_pg_pool;
 };
 
 static inline struct ceph_entity_addr *
index 8a612df4c248bf585708023ee52d6709fa900afa..8587746b7f0eb99a46a1be176b03ce663b68c8df 100644 (file)
@@ -25,12 +25,22 @@ struct ceph_pg {
 
 struct ceph_pg_pool_info {
        struct rb_node node;
-       int id;
-       struct ceph_pg_pool v;
-       int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+       s64 id;
+       u8 type;
+       u8 size;
+       u8 crush_ruleset;
+       u8 object_hash;
+       u32 pg_num, pgp_num;
+       int pg_num_mask, pgp_num_mask;
+       u64 flags;
        char *name;
 };
 
+struct ceph_object_locator {
+       uint64_t pool;
+       char *key;
+};
+
 struct ceph_pg_mapping {
        struct rb_node node;
        struct ceph_pg pgid;
index e7cece69b13f84aa8057ccc0f73cbdec53cd433a..d784c8dfb09ac7f0eca04eda422507d69428a7e7 100644 (file)
@@ -8,14 +8,6 @@
 
 #include <linux/ceph/msgr.h>
 
-/*
- * osdmap encoding versions
- */
-#define CEPH_OSDMAP_INC_VERSION     5
-#define CEPH_OSDMAP_INC_VERSION_EXT 6
-#define CEPH_OSDMAP_VERSION         5
-#define CEPH_OSDMAP_VERSION_EXT     6
-
 /*
  * fs id
  */
@@ -91,21 +83,6 @@ struct ceph_pg_v1 {
 
 #define CEPH_PG_TYPE_REP     1
 #define CEPH_PG_TYPE_RAID4   2
-#define CEPH_PG_POOL_VERSION 2
-struct ceph_pg_pool {
-       __u8 type;                /* CEPH_PG_TYPE_* */
-       __u8 size;                /* number of osds in each pg */
-       __u8 crush_ruleset;       /* crush placement rule */
-       __u8 object_hash;         /* hash mapping object name to ps */
-       __le32 pg_num, pgp_num;   /* number of pg's */
-       __le32 lpg_num, lpgp_num; /* number of localized pg's */
-       __le32 last_change;       /* most recent epoch changed */
-       __le64 snap_seq;          /* seq for per-pool snapshot */
-       __le32 snap_epoch;        /* epoch of last snap */
-       __le32 num_snaps;
-       __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
-       __le64 auid;               /* who owns the pg */
-} __attribute__ ((packed));
 
 /*
  * stable_mod func is used to control number of placement groups.
index c236c235c4a2499e11996c38ea8e806d78e6621a..c5605ae9671448d4a3a72b96339bcb9fcebe7fe8 100644 (file)
@@ -601,10 +601,8 @@ static int __init init_ceph_lib(void)
        if (ret < 0)
                goto out_crypto;
 
-       pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
-               CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
-               CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
-               CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+       pr_info("loaded (mon/osd proto %d/%d)\n",
+               CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);
 
        return 0;
 
index 61a9af634f8bf5a57574c88306487ec7b9bfe48f..f4d4b27d6026dbd01324c50abc6ae6a9fc28f77b 100644 (file)
@@ -66,9 +66,9 @@ static int osdmap_show(struct seq_file *s, void *p)
        for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
                struct ceph_pg_pool_info *pool =
                        rb_entry(n, struct ceph_pg_pool_info, node);
-               seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
-                          pool->id, pool->v.pg_num, pool->pg_num_mask,
-                          pool->v.lpg_num, pool->lpg_num_mask);
+               seq_printf(s, "pg_pool %llu pg_num %d / %d\n",
+                          (unsigned long long)pool->id, pool->pg_num,
+                          pool->pg_num_mask);
        }
        for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
                struct ceph_entity_addr *addr =
index 81118db5bd11955659aff7e9527ca75f7e0e6539..911919320d2e1a7bf561e52648ba0c6ee0c7a8d0 100644 (file)
@@ -45,13 +45,8 @@ static int calc_bits_of(unsigned int t)
  */
 static void calc_pg_masks(struct ceph_pg_pool_info *pi)
 {
-       pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
-       pi->pgp_num_mask =
-               (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
-       pi->lpg_num_mask =
-               (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
-       pi->lpgp_num_mask =
-               (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
+       pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
+       pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
 }
 
 /*
@@ -452,7 +447,7 @@ static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
        return 0;
 }
 
-static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
 {
        struct ceph_pg_pool_info *pi;
        struct rb_node *n = root->rb_node;
@@ -508,24 +503,57 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
 
 static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 {
-       unsigned int n, m;
+       u8 ev, cv;
+       unsigned len, num;
+       void *pool_end;
+
+       ceph_decode_need(p, end, 2 + 4, bad);
+       ev = ceph_decode_8(p);  /* encoding version */
+       cv = ceph_decode_8(p); /* compat version */
+       if (ev < 5) {
+               pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
+               return -EINVAL;
+       }
+       if (cv > 7) {
+               pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv);
+               return -EINVAL;
+       }
+       len = ceph_decode_32(p);
+       ceph_decode_need(p, end, len, bad);
+       pool_end = *p + len;
 
-       ceph_decode_copy(p, &pi->v, sizeof(pi->v));
-       calc_pg_masks(pi);
+       pi->type = ceph_decode_8(p);
+       pi->size = ceph_decode_8(p);
+       pi->crush_ruleset = ceph_decode_8(p);
+       pi->object_hash = ceph_decode_8(p);
 
-       /* num_snaps * snap_info_t */
-       n = le32_to_cpu(pi->v.num_snaps);
-       while (n--) {
-               ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
-                                sizeof(struct ceph_timespec), bad);
-               *p += sizeof(u64) +       /* key */
-                       1 + sizeof(u64) + /* u8, snapid */
-                       sizeof(struct ceph_timespec);
-               m = ceph_decode_32(p);    /* snap name */
-               *p += m;
+       pi->pg_num = ceph_decode_32(p);
+       pi->pgp_num = ceph_decode_32(p);
+
+       *p += 4 + 4;  /* skip lpg* */
+       *p += 4;      /* skip last_change */
+       *p += 8 + 4;  /* skip snap_seq, snap_epoch */
+
+       /* skip snaps */
+       num = ceph_decode_32(p);
+       while (num--) {
+               *p += 8;  /* snapid key */
+               *p += 1 + 1; /* versions */
+               len = ceph_decode_32(p);
+               *p += len;
        }
 
-       *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+       /* skip removed snaps */
+       num = ceph_decode_32(p);
+       *p += num * (8 + 8);
+
+       *p += 8;  /* skip auid */
+       pi->flags = ceph_decode_64(p);
+
+       /* ignore the rest */
+
+       *p = pool_end;
+       calc_pg_masks(pi);
        return 0;
 
 bad:
@@ -535,14 +563,15 @@ bad:
 static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
 {
        struct ceph_pg_pool_info *pi;
-       u32 num, len, pool;
+       u32 num, len;
+       u64 pool;
 
        ceph_decode_32_safe(p, end, num, bad);
        dout(" %d pool names\n", num);
        while (num--) {
-               ceph_decode_32_safe(p, end, pool, bad);
+               ceph_decode_64_safe(p, end, pool, bad);
                ceph_decode_32_safe(p, end, len, bad);
-               dout("  pool %d len %d\n", pool, len);
+               dout("  pool %llu len %d\n", pool, len);
                ceph_decode_need(p, end, len, bad);
                pi = __lookup_pg_pool(&map->pg_pools, pool);
                if (pi) {
@@ -633,7 +662,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
        struct ceph_osdmap *map;
        u16 version;
        u32 len, max, i;
-       u8 ev;
        int err = -EINVAL;
        void *start = *p;
        struct ceph_pg_pool_info *pi;
@@ -646,9 +674,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
        map->pg_temp = RB_ROOT;
 
        ceph_decode_16_safe(p, end, version, bad);
-       if (version > CEPH_OSDMAP_VERSION) {
-               pr_warning("got unknown v %d > %d of osdmap\n", version,
-                          CEPH_OSDMAP_VERSION);
+       if (version > 6) {
+               pr_warning("got unknown v %d > 6 of osdmap\n", version);
+               goto bad;
+       }
+       if (version < 6) {
+               pr_warning("got old v %d < 6 of osdmap\n", version);
                goto bad;
        }
 
@@ -660,20 +691,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 
        ceph_decode_32_safe(p, end, max, bad);
        while (max--) {
-               ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+               ceph_decode_need(p, end, 8 + 2, bad);
                err = -ENOMEM;
                pi = kzalloc(sizeof(*pi), GFP_NOFS);
                if (!pi)
                        goto bad;
-               pi->id = ceph_decode_32(p);
-               err = -EINVAL;
-               ev = ceph_decode_8(p); /* encoding version */
-               if (ev > CEPH_PG_POOL_VERSION) {
-                       pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-                                  ev, CEPH_PG_POOL_VERSION);
-                       kfree(pi);
-                       goto bad;
-               }
+               pi->id = ceph_decode_64(p);
                err = __decode_pool(p, end, pi);
                if (err < 0) {
                        kfree(pi);
@@ -682,12 +705,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
                __insert_pg_pool(&map->pg_pools, pi);
        }
 
-       if (version >= 5) {
-               err = __decode_pool_names(p, end, map);
-               if (err < 0) {
-                       dout("fail to decode pool names");
-                       goto bad;
-               }
+       err = __decode_pool_names(p, end, map);
+       if (err < 0) {
+               dout("fail to decode pool names");
+               goto bad;
        }
 
        ceph_decode_32_safe(p, end, map->pool_max, bad);
@@ -788,16 +809,17 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        struct ceph_fsid fsid;
        u32 epoch = 0;
        struct ceph_timespec modified;
-       u32 len, pool;
-       __s32 new_pool_max, new_flags, max;
+       s32 len;
+       u64 pool;
+       __s64 new_pool_max;
+       __s32 new_flags, max;
        void *start = *p;
        int err = -EINVAL;
        u16 version;
 
        ceph_decode_16_safe(p, end, version, bad);
-       if (version > CEPH_OSDMAP_INC_VERSION) {
-               pr_warning("got unknown v %d > %d of inc osdmap\n", version,
-                          CEPH_OSDMAP_INC_VERSION);
+       if (version > 6) {
+               pr_warning("got unknown v %d > %d of inc osdmap\n", version, 6);
                goto bad;
        }
 
@@ -807,7 +829,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        epoch = ceph_decode_32(p);
        BUG_ON(epoch != map->epoch+1);
        ceph_decode_copy(p, &modified, sizeof(modified));
-       new_pool_max = ceph_decode_32(p);
+       new_pool_max = ceph_decode_64(p);
        new_flags = ceph_decode_32(p);
 
        /* full map? */
@@ -857,18 +879,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        /* new_pool */
        ceph_decode_32_safe(p, end, len, bad);
        while (len--) {
-               __u8 ev;
                struct ceph_pg_pool_info *pi;
 
-               ceph_decode_32_safe(p, end, pool, bad);
-               ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
-               ev = ceph_decode_8(p);  /* encoding version */
-               if (ev > CEPH_PG_POOL_VERSION) {
-                       pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-                                  ev, CEPH_PG_POOL_VERSION);
-                       err = -EINVAL;
-                       goto bad;
-               }
+               ceph_decode_64_safe(p, end, pool, bad);
                pi = __lookup_pg_pool(&map->pg_pools, pool);
                if (!pi) {
                        pi = kzalloc(sizeof(*pi), GFP_NOFS);
@@ -894,7 +907,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        while (len--) {
                struct ceph_pg_pool_info *pi;
 
-               ceph_decode_32_safe(p, end, pool, bad);
+               ceph_decode_64_safe(p, end, pool, bad);
                pi = __lookup_pg_pool(&map->pg_pools, pool);
                if (pi)
                        __remove_pg_pool(&map->pg_pools, pi);
@@ -1097,8 +1110,8 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
        pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
        if (!pool)
                return -EIO;
-       pgid.seed = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
-       num = le32_to_cpu(pool->v.pg_num);
+       pgid.seed = ceph_str_hash(pool->object_hash, oid, strlen(oid));
+       num = pool->pg_num;
        num_mask = pool->pg_num_mask;
 
        dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pgid.pool,
@@ -1132,8 +1145,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
                return NULL;
 
        /* pg_temp? */
-       t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num),
-                           pool->pgp_num_mask);
+       t = ceph_stable_mod(ps, pool->pg_num, pool->pgp_num_mask);
        pgid.seed = t;
        pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
        if (pg) {
@@ -1142,26 +1154,24 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
        }
 
        /* crush */
-       ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
-                                pool->v.type, pool->v.size);
+       ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
+                                pool->type, pool->size);
        if (ruleno < 0) {
                pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
-                      poolid, pool->v.crush_ruleset, pool->v.type,
-                      pool->v.size);
+                      poolid, pool->crush_ruleset, pool->type,
+                      pool->size);
                return NULL;
        }
 
-       pps = ceph_stable_mod(ps,
-                             le32_to_cpu(pool->v.pgp_num),
-                             pool->pgp_num_mask);
+       pps = ceph_stable_mod(ps, pool->pgp_num, pool->pgp_num_mask);
        pps += poolid;
        r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
-                         min_t(int, pool->v.size, *num),
+                         min_t(int, pool->size, *num),
                          osdmap->osd_weight);
        if (r < 0) {
                pr_err("error %d from crush rule: pool %d ruleset %d type %d"
-                      " size %d\n", r, poolid, pool->v.crush_ruleset,
-                      pool->v.type, pool->v.size);
+                      " size %d\n", r, poolid, pool->crush_ruleset,
+                      pool->type, pool->size);
                return NULL;
        }
        *num = r;