Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-block.git] / kernel / sched / fair.c
index 5f6587d94c1dd692d2d0cbcf64151e66e690de55..da0d8b0b8a2a8f39513bb537045d45a48ba69979 100644 (file)
@@ -2928,6 +2928,24 @@ static void reset_ptenuma_scan(struct task_struct *p)
        p->mm->numa_scan_offset = 0;
 }
 
+static bool vma_is_accessed(struct vm_area_struct *vma)
+{
+       unsigned long pids;
+       /*
+        * Scan the VMA unconditionally for the first two scan sequences, so
+        * that all of its pages get prot_none faults regardless of access.
+        * This also avoids any side effect of task scanning amplifying the
+        * unfairness of disjoint sets of VMAs' accesses.
+        */
+       if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+               return true;
+
+       pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
+       return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
+}
+
+#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)	/* in ms */
+
 /*
  * The expensive part of numa migration is done from task_work context.
  * Triggered from task_tick_numa().
@@ -3027,6 +3045,45 @@ static void task_numa_work(struct callback_head *work)
                if (!vma_is_accessible(vma))
                        continue;
 
+               /* Initialise new per-VMA NUMAB state. */
+               if (!vma->numab_state) {
+                       vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
+                               GFP_KERNEL);
+                       if (!vma->numab_state)
+                               continue;
+
+                       vma->numab_state->next_scan = now +
+                               msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+
+                       /* The first reset happens 4 scan-delay periods after the scan starts */
+                       vma->numab_state->next_pid_reset =  vma->numab_state->next_scan +
+                               msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+               }
+
+               /*
+                * Scanning the VMAs of short-lived tasks adds more overhead, so
+                * delay the scan for new VMAs.
+                */
+               if (mm->numa_scan_seq && time_before(jiffies,
+                                               vma->numab_state->next_scan))
+                       continue;
+
+               /* Do not scan the VMA if the task has not accessed it */
+               if (!vma_is_accessed(vma))
+                       continue;
+
+               /*
+                * Reset access PIDs regularly for old VMAs. Reset only after the
+                * recent-access check above, so PID info is not cleared first.
+                */
+               if (mm->numa_scan_seq &&
+                               time_after(jiffies, vma->numab_state->next_pid_reset)) {
+                       vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
+                               msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+                       vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
+                       vma->numab_state->access_pids[1] = 0;
+               }
+
                do {
                        start = max(start, vma->vm_start);
                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);