/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAP_LOCK_H
#define _LINUX_MMAP_LOCK_H

/* Avoid a dependency loop by declaring here. */
extern int rcuwait_wake_up(struct rcuwait *w);

#include <linux/lockdep.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/rwsem.h>
#include <linux/tracepoint-defs.h>
#include <linux/types.h>
#include <linux/cleanup.h>

#define MMAP_LOCK_INITIALIZER(name) \
	.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),

DECLARE_TRACEPOINT(mmap_lock_start_locking);
DECLARE_TRACEPOINT(mmap_lock_acquire_returned);
DECLARE_TRACEPOINT(mmap_lock_released);

#ifdef CONFIG_TRACING

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write);

static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
						   bool write)
{
	if (tracepoint_enabled(mmap_lock_start_locking))
		__mmap_lock_do_trace_start_locking(mm, write);
}

static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
						      bool write, bool success)
{
	if (tracepoint_enabled(mmap_lock_acquire_returned))
		__mmap_lock_do_trace_acquire_returned(mm, write, success);
}

static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
	if (tracepoint_enabled(mmap_lock_released))
		__mmap_lock_do_trace_released(mm, write);
}

#else /* !CONFIG_TRACING */

static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
						   bool write)
{
}

static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
						      bool write, bool success)
{
}

static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
}

#endif /* CONFIG_TRACING */

static inline void mmap_assert_locked(const struct mm_struct *mm)
{
	rwsem_assert_held(&mm->mmap_lock);
}

static inline void mmap_assert_write_locked(const struct mm_struct *mm)
{
	rwsem_assert_held_write(&mm->mmap_lock);
}

#ifdef CONFIG_PER_VMA_LOCK

static inline void mm_lock_seqcount_init(struct mm_struct *mm)
{
	seqcount_init(&mm->mm_lock_seq);
}

static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
{
	do_raw_write_seqcount_begin(&mm->mm_lock_seq);
}

static inline void mm_lock_seqcount_end(struct mm_struct *mm)
{
	ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq);
	do_raw_write_seqcount_end(&mm->mm_lock_seq);
}

static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
	/*
	 * Since mmap_lock is a sleeping lock, and waiting for it to become
	 * unlocked is more or less equivalent to taking it ourselves, don't
	 * bother with the speculative path if mmap_lock is already
	 * write-locked; take the slow path, which takes the lock, instead.
	 */
	return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq);
}

static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
	return read_seqcount_retry(&mm->mm_lock_seq, seq);
}
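
/*
 * Illustrative sketch (hypothetical caller, not part of this header): a
 * lockless reader speculates against the mmap_lock write sequence and falls
 * back to taking the lock when speculation cannot start or is invalidated.
 *
 *	unsigned int seq;
 *
 *	if (mmap_lock_speculate_try_begin(mm, &seq)) {
 *		read_mm_state_locklessly(mm);		// hypothetical helper
 *		if (!mmap_lock_speculate_retry(mm, seq))
 *			return;				// no writer raced with us
 *	}
 *	// speculation failed or mmap_lock is write-locked: take the lock
 *	mmap_read_lock(mm);
 *	read_mm_state(mm);				// hypothetical helper
 *	mmap_read_unlock(mm);
 */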

static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	static struct lock_class_key lockdep_key;

	lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
#endif
	if (reset_refcnt)
		refcount_set(&vma->vm_refcnt, 0);
	vma->vm_lock_seq = UINT_MAX;
}

static inline bool is_vma_writer_only(int refcnt)
{
	/*
	 * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
	 * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
	 * a detached vma happens only in vma_mark_detached() and is a rare
	 * case, therefore most of the time there will be no unnecessary wakeup.
	 */
	return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
}
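
/*
 * A worked example of the vm_refcnt encoding that the check above relies on
 * (illustrative; see the definitions of VMA_LOCK_OFFSET and VMA_REF_LIMIT for
 * the authoritative values):
 *
 *	0				detached, unlocked
 *	1				attached, unlocked
 *	1 + N				attached, N readers
 *	VMA_LOCK_OFFSET			detached, writer waiting for readers
 *	VMA_LOCK_OFFSET + 1		attached, write-locked, no readers
 *	VMA_LOCK_OFFSET + 1 + N		attached, write-locked, N readers draining
 *
 * is_vma_writer_only() is true exactly for the two "writer, no readers" rows,
 * which are the only cases where waking the waiting writer is useful.
 */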

static inline void vma_refcount_put(struct vm_area_struct *vma)
{
	/* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
	struct mm_struct *mm = vma->vm_mm;
	int oldcnt;

	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
	if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
		if (is_vma_writer_only(oldcnt - 1))
			rcuwait_wake_up(&mm->vma_writer_wait);
	}
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield a
 * false locked result to avoid performance overhead, in which case we fall
 * back to using mmap_lock. The function should never yield a false unlocked
 * result. A false locked result is possible if mm_lock_seq overflows or if
 * the vma gets reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						    struct vm_area_struct *vma)
{
	int oldcnt;

	/*
	 * Check before locking. A race might cause false locked result.
	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
	 * ACQUIRE semantics, because this is just a lockless check whose result
	 * we don't rely on for anything - the mm_lock_seq read against which we
	 * need ordering is below.
	 */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
		return NULL;

	/*
	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
	 * Acquire fence is required here to avoid reordering against later
	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
	 */
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VMA_REF_LIMIT))) {
		/* return ERR_PTR(-EAGAIN) if vma got detached from under us */
		return oldcnt ? NULL : ERR_PTR(-EAGAIN);
	}

	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
	/*
	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
	 * False unlocked result is impossible because we modify and check
	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
	 * modification invalidates all existing locks.
	 *
	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
	 * racing with vma_end_write_all(), we only start reading from the VMA
	 * after it has been unlocked.
	 * This pairs with RELEASE semantics in vma_end_write_all().
	 */
	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
		vma_refcount_put(vma);
		return NULL;
	}

	return vma;
}
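
/*
 * Illustrative sketch (hypothetical caller running under rcu_read_lock()) of
 * how the three possible results of vma_start_read() are consumed:
 *
 *	vma = vma_start_read(mm, vma);
 *	if (IS_ERR_OR_NULL(vma)) {
 *		if (PTR_ERR(vma) == -EAGAIN)
 *			goto retry_lookup;	// vma was detached, look it up again
 *		goto fallback;			// NULL: fall back to mmap_read_lock()
 *	}
 *	// ... use the vma, then drop the per-VMA read lock:
 *	vma_end_read(vma);
 */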

/*
 * Use only while holding the mmap read lock, which guarantees that locking
 * will not fail (nobody can concurrently write-lock the vma).
 * vma_start_read() should not be used in such cases because it might fail
 * due to mm_lock_seq overflow. This helper is used to obtain a vma read lock
 * and then drop the mmap read lock.
 */
static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
{
	int oldcnt;

	mmap_assert_locked(vma->vm_mm);
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VMA_REF_LIMIT)))
		return false;

	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
	return true;
}

/*
 * Like vma_start_read_locked_nested() above, but without a lockdep subclass:
 * use only while holding the mmap read lock, which guarantees that locking
 * will not fail.
 */
static inline bool vma_start_read_locked(struct vm_area_struct *vma)
{
	return vma_start_read_locked_nested(vma, 0);
}
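
/*
 * Illustrative sketch of the lock handoff these helpers exist for
 * (hypothetical caller): read-lock the vma while the mmap read lock is held,
 * then drop the mmap lock and keep working under the per-VMA lock alone.
 *
 *	mmap_read_lock(mm);
 *	vma = find_vma(mm, addr);
 *	if (vma && vma_start_read_locked(vma)) {
 *		mmap_read_unlock(mm);		// keep only the vma read lock
 *		// ... operate on the vma ...
 *		vma_end_read(vma);
 *		return;
 *	}
 *	// ... otherwise keep operating under the mmap read lock ...
 *	mmap_read_unlock(mm);
 */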

static inline void vma_end_read(struct vm_area_struct *vma)
{
	vma_refcount_put(vma);
}

/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
{
	mmap_assert_write_locked(vma->vm_mm);

	/*
	 * The current task is holding the mmap write lock, so neither
	 * vma->vm_lock_seq nor mm->mm_lock_seq can be concurrently modified.
	 */
	*mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
	return (vma->vm_lock_seq == *mm_lock_seq);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);

/*
 * Begin writing to a VMA.
 * Exclude concurrent readers under the per-VMA lock until the currently
 * write-locked mmap_lock is dropped or downgraded.
 */
static inline void vma_start_write(struct vm_area_struct *vma)
{
	unsigned int mm_lock_seq;

	if (__is_vma_write_locked(vma, &mm_lock_seq))
		return;

	__vma_start_write(vma, mm_lock_seq);
}
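
/*
 * Illustrative sketch (hypothetical caller): a writer must hold the mmap
 * write lock and write-lock each vma it is about to modify; the per-VMA
 * write locks are then dropped implicitly when the mmap_lock is released or
 * downgraded (see vma_end_write_all() below).
 *
 *	mmap_write_lock(mm);
 *	vma_start_write(vma);		// exclude per-VMA-lock readers
 *	// ... modify the vma (flags, ranges, ...) ...
 *	mmap_write_unlock(mm);		// also releases all vma write locks
 */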

static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
	unsigned int mm_lock_seq;

	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}

static inline void vma_assert_locked(struct vm_area_struct *vma)
{
	unsigned int mm_lock_seq;

	VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
		      !__is_vma_write_locked(vma, &mm_lock_seq), vma);
}

/*
 * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
 * assertions should be made either under mmap_write_lock or when the object
 * has been isolated under mmap_write_lock, ensuring no competing writers.
 */
static inline void vma_assert_attached(struct vm_area_struct *vma)
{
	WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
}

static inline void vma_assert_detached(struct vm_area_struct *vma)
{
	WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
}

static inline void vma_mark_attached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_detached(vma);
	refcount_set_release(&vma->vm_refcnt, 1);
}

void vma_mark_detached(struct vm_area_struct *vma);

struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address);
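
/*
 * Illustrative sketch of the intended lock_vma_under_rcu() pattern, loosely
 * modeled on a page-fault-style caller (hypothetical code, error handling
 * trimmed): try the per-VMA lock first, fall back to the mmap read lock when
 * that fails.
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (vma) {
 *		// ... handle the fault under the per-VMA read lock ...
 *		vma_end_read(vma);
 *		return;
 *	}
 *	// per-VMA locking failed: fall back to the coarser mmap_lock
 *	mmap_read_lock(mm);
 *	vma = find_vma(mm, address);
 *	// ... handle the fault under the mmap read lock ...
 *	mmap_read_unlock(mm);
 */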

#else /* CONFIG_PER_VMA_LOCK */

static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}

static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
	return false;
}

static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
	return true;
}
static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						    struct vm_area_struct *vma)
		{ return NULL; }
static inline void vma_end_read(struct vm_area_struct *vma) {}
static inline void vma_start_write(struct vm_area_struct *vma) {}
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
		{ mmap_assert_write_locked(vma->vm_mm); }
static inline void vma_assert_attached(struct vm_area_struct *vma) {}
static inline void vma_assert_detached(struct vm_area_struct *vma) {}
static inline void vma_mark_attached(struct vm_area_struct *vma) {}
static inline void vma_mark_detached(struct vm_area_struct *vma) {}

static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
							unsigned long address)
{
	return NULL;
}

static inline void vma_assert_locked(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);
}

#endif /* CONFIG_PER_VMA_LOCK */

static inline void mmap_write_lock(struct mm_struct *mm)
{
	__mmap_lock_trace_start_locking(mm, true);
	down_write(&mm->mmap_lock);
	mm_lock_seqcount_begin(mm);
	__mmap_lock_trace_acquire_returned(mm, true, true);
}

static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
	__mmap_lock_trace_start_locking(mm, true);
	down_write_nested(&mm->mmap_lock, subclass);
	mm_lock_seqcount_begin(mm);
	__mmap_lock_trace_acquire_returned(mm, true, true);
}

static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
	int ret;

	__mmap_lock_trace_start_locking(mm, true);
	ret = down_write_killable(&mm->mmap_lock);
	if (!ret)
		mm_lock_seqcount_begin(mm);
	__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
	return ret;
}
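
/*
 * Illustrative sketch (hypothetical syscall-style caller): the killable
 * variant returns -EINTR if a fatal signal interrupts the wait, in which
 * case the caller must bail out without touching the address space.
 *
 *	if (mmap_write_lock_killable(mm))
 *		return -EINTR;
 *	// ... modify the address space ...
 *	mmap_write_unlock(mm);
 */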

/*
 * Drop all currently-held per-VMA locks.
 * This is called from the mmap_lock implementation directly before releasing
 * a write-locked mmap_lock (or downgrading it to read-locked).
 * This should normally NOT be called manually from other places.
 * If you want to call this manually anyway, keep in mind that this will release
 * *all* VMA write locks, including ones from further up the stack.
 */
static inline void vma_end_write_all(struct mm_struct *mm)
{
	mmap_assert_write_locked(mm);
	mm_lock_seqcount_end(mm);
}
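
/*
 * Why bumping mm_lock_seq releases every vma write lock: vma_start_write()
 * records the current (write-in-progress) mm_lock_seq value in
 * vma->vm_lock_seq, and "vma->vm_lock_seq == mm->mm_lock_seq" is what the
 * per-VMA code treats as "write-locked". A worked sequence with illustrative
 * values:
 *
 *	mmap_write_lock(mm);		// mm_lock_seq: 8 -> 9
 *	vma_start_write(vmaA);		// vmaA->vm_lock_seq = 9 -> write-locked
 *	vma_start_write(vmaB);		// vmaB->vm_lock_seq = 9 -> write-locked
 *	mmap_write_unlock(mm);		// vma_end_write_all(): mm_lock_seq -> 10
 *					// 9 != 10, so vmaA and vmaB are unlocked
 */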

static inline void mmap_write_unlock(struct mm_struct *mm)
{
	__mmap_lock_trace_released(mm, true);
	vma_end_write_all(mm);
	up_write(&mm->mmap_lock);
}

static inline void mmap_write_downgrade(struct mm_struct *mm)
{
	__mmap_lock_trace_acquire_returned(mm, false, true);
	vma_end_write_all(mm);
	downgrade_write(&mm->mmap_lock);
}

static inline void mmap_read_lock(struct mm_struct *mm)
{
	__mmap_lock_trace_start_locking(mm, false);
	down_read(&mm->mmap_lock);
	__mmap_lock_trace_acquire_returned(mm, false, true);
}

static inline int mmap_read_lock_killable(struct mm_struct *mm)
{
	int ret;

	__mmap_lock_trace_start_locking(mm, false);
	ret = down_read_killable(&mm->mmap_lock);
	__mmap_lock_trace_acquire_returned(mm, false, ret == 0);
	return ret;
}

static inline bool mmap_read_trylock(struct mm_struct *mm)
{
	bool ret;

	__mmap_lock_trace_start_locking(mm, false);
	ret = down_read_trylock(&mm->mmap_lock) != 0;
	__mmap_lock_trace_acquire_returned(mm, false, ret);
	return ret;
}

static inline void mmap_read_unlock(struct mm_struct *mm)
{
	__mmap_lock_trace_released(mm, false);
	up_read(&mm->mmap_lock);
}

DEFINE_GUARD(mmap_read_lock, struct mm_struct *,
	     mmap_read_lock(_T), mmap_read_unlock(_T))
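
/*
 * Illustrative sketch (hypothetical caller): the guard defined above enables
 * scope-based locking via <linux/cleanup.h>, so the mmap read lock is dropped
 * automatically on every return path.
 *
 *	guard(mmap_read_lock)(mm);
 *	// ... the lock is released when the enclosing scope ends ...
 *
 * or, for an explicit block:
 *
 *	scoped_guard(mmap_read_lock, mm) {
 *		// ... locked region ...
 *	}
 */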

static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
	__mmap_lock_trace_released(mm, false);
	up_read_non_owner(&mm->mmap_lock);
}

static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
	return rwsem_is_contended(&mm->mmap_lock);
}

#endif /* _LINUX_MMAP_LOCK_H */