From 7858d7bca7fbbbbd5b940d2ec371b2d060b21b84 Mon Sep 17 00:00:00 2001
From: Feng Tang
Date: Wed, 30 Jun 2021 18:51:00 -0700
Subject: [PATCH] mm/mempolicy: don't handle MPOL_LOCAL like a fake
 MPOL_PREFERRED policy

The MPOL_LOCAL policy has been set up as a real policy, but it is still
handled like a fake MPOL_PREFERRED policy with the internal
MPOL_F_LOCAL flag bit set, so many places have to distinguish the real
'prefer' policy from the 'local' one, which is quite confusing.

In the current code, MPOL_LOCAL is used in 4 cases:

1. the user specifies the 'local' policy
2. the user specifies the 'prefer' policy with an empty nodemask
3. the system 'default' policy is used
4. the user specifies the 'prefer' policy with a valid 'preferred' node
   and the MPOL_F_STATIC_NODES flag set; when the policy is rebound to
   a nodemask which doesn't contain the 'preferred' node, it behaves
   like the 'local' policy

So make 'local' a real policy instead of a fake 'prefer' one, and kill
the MPOL_F_LOCAL bit, which greatly reduces the confusion when reading
the code.

For case 4, the logic of mpol_rebind_preferred() is confusing, as
Michal Hocko pointed out:

: I do believe that rebinding preferred policy is just bogus and it should
: be dropped altogether on the ground that a preference is a mere hint from
: userspace where to start the allocation. Unless I am missing something
: cpusets will be always authoritative for the final placement. The
: preferred node just acts as a starting point and it should be really
: preserved when cpusets changes. Otherwise we have a very subtle behavior
: corner cases.

So drop all the tricky transformations between 'prefer' and 'local',
and just record the new nodemask on rebinding.

[feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko]
  Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com
[feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal]
  Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com
Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang
Suggested-by: Michal Hocko
Acked-by: Michal Hocko
Cc: Andi Kleen
Cc: Andrea Arcangeli
Cc: Ben Widawsky
Cc: Dan Williams
Cc: Dave Hansen
Cc: David Rientjes
Cc: Huang Ying
Cc: Mel Gorman
Cc: Michal Hocko
Cc: Mike Kravetz
Cc: Randy Dunlap
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
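[ Illustrative note, not part of the commit message: below is a minimal
  userspace sketch of the two equivalent requests that this patch
  unifies internally (cases 1 and 2 in the changelog above).  It
  assumes libnuma's <numaif.h> providing set_mempolicy() and an
  MPOL_LOCAL definition (kernel >= 3.8); build with -lnuma. ]

	#include <numaif.h>
	#include <stdio.h>

	int main(void)
	{
		/* Case 1: request an explicit 'local' policy. */
		if (set_mempolicy(MPOL_LOCAL, NULL, 0))
			perror("set_mempolicy(MPOL_LOCAL)");

		/*
		 * Case 2: 'prefer' with an empty (NULL) nodemask has always
		 * meant local allocation; after this patch the kernel
		 * records it as a real MPOL_LOCAL policy rather than as
		 * MPOL_PREFERRED with the internal MPOL_F_LOCAL flag.
		 */
		if (set_mempolicy(MPOL_PREFERRED, NULL, 0))
			perror("set_mempolicy(MPOL_PREFERRED)");

		return 0;
	}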
 include/uapi/linux/mempolicy.h |   1 -
 mm/mempolicy.c                 | 136 ++++++++++++++------------------
 2 files changed, 56 insertions(+), 81 deletions(-)

diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 4832fd0b5642..19a00bc7fe86 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -60,7 +60,6 @@ enum {
  * are never OR'ed into the mode in mempolicy API arguments.
  */
 #define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
-#define MPOL_F_LOCAL   (1 << 1)	/* preferred local allocation */
 #define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */
 #define MPOL_F_MORON	(1 << 4) /* Migrate On protnone Reference On Node */
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bd213b900e71..22addae6c4ee 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -121,8 +121,7 @@ enum zone_type policy_zone = 0;
  */
 static struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
-	.mode = MPOL_PREFERRED,
-	.flags = MPOL_F_LOCAL,
+	.mode = MPOL_LOCAL,
 };
 
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
@@ -200,12 +199,9 @@ static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 
 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 {
-	if (!nodes)
-		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
-	else if (nodes_empty(*nodes))
-		return -EINVAL;			/* no allowed nodes */
-	else
-		pol->v.preferred_node = first_node(*nodes);
+	if (nodes_empty(*nodes))
+		return -EINVAL;
+	pol->v.preferred_node = first_node(*nodes);
 	return 0;
 }
 
@@ -220,8 +216,7 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 /*
  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
  * any, for the new policy.  mpol_new() has already validated the nodes
- * parameter with respect to the policy mode and flags.  But, we need to
- * handle an empty nodemask with MPOL_PREFERRED here.
+ * parameter with respect to the policy mode and flags.
  *
  * Must be called holding task's alloc_lock to protect task's mems_allowed
  * and mempolicy.  May also be called holding the mmap_lock for write.
@@ -231,33 +226,31 @@ static int mpol_set_nodemask(struct mempolicy *pol,
 {
 	int ret;
 
-	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
-	if (pol == NULL)
+	/*
+	 * Default (pol==NULL) resp. local memory policies are not a
+	 * subject of any remapping. They also do not need any special
+	 * constructor.
+	 */
+	if (!pol || pol->mode == MPOL_LOCAL)
 		return 0;
+
 	/* Check N_MEMORY */
 	nodes_and(nsc->mask1,
 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 
 	VM_BUG_ON(!nodes);
-	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
-		nodes = NULL;	/* explicit local allocation */
-	else {
-		if (pol->flags & MPOL_F_RELATIVE_NODES)
-			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
-		else
-			nodes_and(nsc->mask2, *nodes, nsc->mask1);
-
-		if (mpol_store_user_nodemask(pol))
-			pol->w.user_nodemask = *nodes;
-		else
-			pol->w.cpuset_mems_allowed =
-						cpuset_current_mems_allowed;
-	}
+
+	if (pol->flags & MPOL_F_RELATIVE_NODES)
+		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
+	else
+		nodes_and(nsc->mask2, *nodes, nsc->mask1);
 
-	if (nodes)
-		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+	if (mpol_store_user_nodemask(pol))
+		pol->w.user_nodemask = *nodes;
 	else
-		ret = mpol_ops[pol->mode].create(pol, NULL);
+		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
+
+	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 	return ret;
 }
 
@@ -290,13 +283,14 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 			if (((flags & MPOL_F_STATIC_NODES) ||
 			     (flags & MPOL_F_RELATIVE_NODES)))
 				return ERR_PTR(-EINVAL);
+
+			mode = MPOL_LOCAL;
 		}
 	} else if (mode == MPOL_LOCAL) {
 		if (!nodes_empty(*nodes) ||
 		    (flags & MPOL_F_STATIC_NODES) ||
 		    (flags & MPOL_F_RELATIVE_NODES))
 			return ERR_PTR(-EINVAL);
-		mode = MPOL_PREFERRED;
 	} else if (nodes_empty(*nodes))
 		return ERR_PTR(-EINVAL);
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -344,25 +338,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 static void mpol_rebind_preferred(struct mempolicy *pol,
 						const nodemask_t *nodes)
 {
-	nodemask_t tmp;
-
-	if (pol->flags & MPOL_F_STATIC_NODES) {
-		int node = first_node(pol->w.user_nodemask);
-
-		if (node_isset(node, *nodes)) {
-			pol->v.preferred_node = node;
-			pol->flags &= ~MPOL_F_LOCAL;
-		} else
-			pol->flags |= MPOL_F_LOCAL;
-	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
-		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
-		pol->v.preferred_node = first_node(tmp);
-	} else if (!(pol->flags & MPOL_F_LOCAL)) {
-		pol->v.preferred_node = node_remap(pol->v.preferred_node,
-						   pol->w.cpuset_mems_allowed,
-						   *nodes);
-		pol->w.cpuset_mems_allowed = *nodes;
-	}
+	pol->w.cpuset_mems_allowed = *nodes;
 }
 
 /*
@@ -376,7 +352,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
 	if (!pol)
 		return;
-	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
+	if (!mpol_store_user_nodemask(pol) &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 		return;
 
@@ -427,6 +403,9 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 		.create = mpol_new_bind,
 		.rebind = mpol_rebind_nodemask,
 	},
+	[MPOL_LOCAL] = {
+		.rebind = mpol_rebind_default,
+	},
 };
 
 static int migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -919,10 +898,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 	case MPOL_INTERLEAVE:
 		*nodes = p->v.nodes;
 		break;
+	case MPOL_LOCAL:
+		/* return empty node mask for local allocation */
+		break;
+
 	case MPOL_PREFERRED:
-		if (!(p->flags & MPOL_F_LOCAL))
-			node_set(p->v.preferred_node, *nodes);
-		/* else return empty node mask for local allocation */
+		node_set(p->v.preferred_node, *nodes);
 		break;
 	default:
 		BUG();
@@ -1894,9 +1875,9 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 /* Return the node id preferred by the given mempolicy, or the given id */
 static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 {
-	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
+	if (policy->mode == MPOL_PREFERRED) {
 		nd = policy->v.preferred_node;
-	else {
+	} else {
 		/*
 		 * __GFP_THISNODE shouldn't even be used with the bind policy
 		 * because we might easily break the expectation to stay on the
@@ -1933,14 +1914,11 @@ unsigned int mempolicy_slab_node(void)
 		return node;
 
 	policy = current->mempolicy;
-	if (!policy || policy->flags & MPOL_F_LOCAL)
+	if (!policy)
 		return node;
 
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
-		/*
-		 * handled MPOL_F_LOCAL above
-		 */
 		return policy->v.preferred_node;
 
 	case MPOL_INTERLEAVE:
@@ -1960,6 +1938,8 @@ unsigned int mempolicy_slab_node(void)
 						&policy->v.nodes);
 		return z->zone ? zone_to_nid(z->zone) : node;
 	}
+	case MPOL_LOCAL:
+		return node;
 
 	default:
 		BUG();
@@ -2072,16 +2052,18 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 	mempolicy = current->mempolicy;
 	switch (mempolicy->mode) {
 	case MPOL_PREFERRED:
-		if (mempolicy->flags & MPOL_F_LOCAL)
-			nid = numa_node_id();
-		else
-			nid = mempolicy->v.preferred_node;
+		nid = mempolicy->v.preferred_node;
 		init_nodemask_of_node(mask, nid);
 		break;
 
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
-		*mask =  mempolicy->v.nodes;
+		*mask = mempolicy->v.nodes;
+		break;
+
+	case MPOL_LOCAL:
+		nid = numa_node_id();
+		init_nodemask_of_node(mask, nid);
 		break;
 
 	default:
@@ -2188,7 +2170,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		 * If the policy is interleave, or does not allow the current
 		 * node in its nodemask, we allocate the standard way.
 		 */
-		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
+		if (pol->mode == MPOL_PREFERRED)
 			hpage_node = pol->v.preferred_node;
 
 		nmask = policy_nodemask(gfp, pol);
@@ -2324,10 +2306,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_INTERLEAVE:
 		return !!nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
-		/* a's ->flags is the same as b's */
-		if (a->flags & MPOL_F_LOCAL)
-			return true;
 		return a->v.preferred_node == b->v.preferred_node;
+	case MPOL_LOCAL:
+		return true;
 	default:
 		BUG();
 		return false;
@@ -2465,10 +2446,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		break;
 
 	case MPOL_PREFERRED:
-		if (pol->flags & MPOL_F_LOCAL)
-			polnid = numa_node_id();
-		else
-			polnid = pol->v.preferred_node;
+		polnid = pol->v.preferred_node;
+		break;
+
+	case MPOL_LOCAL:
+		polnid = numa_node_id();
 		break;
 
 	case MPOL_BIND:
@@ -2835,9 +2817,6 @@ void numa_default_policy(void)
  * Parse and format mempolicy from/to strings
  */
 
-/*
- * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
- */
 static const char * const policy_modes[] =
 {
 	[MPOL_DEFAULT] = "default",
@@ -2915,7 +2894,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
 		 */
 		if (nodelist)
 			goto out;
-		mode = MPOL_PREFERRED;
 		break;
 	case MPOL_DEFAULT:
 		/*
@@ -2959,7 +2937,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
 	else if (nodelist)
 		new->v.preferred_node = first_node(nodes);
 	else
-		new->flags |= MPOL_F_LOCAL;
+		new->mode = MPOL_LOCAL;
 
 	/*
 	 * Save nodes for contextualization: this will be used to "clone"
@@ -3005,12 +2983,10 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 
 	switch (mode) {
 	case MPOL_DEFAULT:
+	case MPOL_LOCAL:
 		break;
 	case MPOL_PREFERRED:
-		if (flags & MPOL_F_LOCAL)
-			mode = MPOL_LOCAL;
-		else
-			node_set(pol->v.preferred_node, nodes);
+		node_set(pol->v.preferred_node, nodes);
 		break;
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
-- 
2.25.1