Commit | Line | Data |
---|---|---|
769071ac AV |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Author: Andrei Vagin <avagin@openvz.org> | |
4 | * Author: Dmitry Safonov <dima@arista.com> | |
5 | */ | |
6 | ||
7 | #include <linux/time_namespace.h> | |
8 | #include <linux/user_namespace.h> | |
9 | #include <linux/sched/signal.h> | |
10 | #include <linux/sched/task.h> | |
2d6b01bd | 11 | #include <linux/clocksource.h> |
04a8682a | 12 | #include <linux/seq_file.h> |
769071ac AV |
13 | #include <linux/proc_ns.h> |
14 | #include <linux/export.h> | |
15 | #include <linux/time.h> | |
16 | #include <linux/slab.h> | |
17 | #include <linux/cred.h> | |
18 | #include <linux/err.h> | |
af993f58 | 19 | #include <linux/mm.h> |
769071ac | 20 | |
afaa7b5a DS |
21 | #include <vdso/datapage.h> |
22 | ||
89dd8eec AV |
23 | ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, |
24 | struct timens_offsets *ns_offsets) | |
25 | { | |
26 | ktime_t offset; | |
27 | ||
28 | switch (clockid) { | |
29 | case CLOCK_MONOTONIC: | |
30 | offset = timespec64_to_ktime(ns_offsets->monotonic); | |
31 | break; | |
32 | case CLOCK_BOOTTIME: | |
33 | case CLOCK_BOOTTIME_ALARM: | |
34 | offset = timespec64_to_ktime(ns_offsets->boottime); | |
35 | break; | |
36 | default: | |
37 | return tim; | |
38 | } | |
39 | ||
40 | /* | |
41 | * Check that @tim value is in [offset, KTIME_MAX + offset] | |
42 | * and subtract offset. | |
43 | */ | |
44 | if (tim < offset) { | |
45 | /* | |
46 | * User can specify @tim *absolute* value - if it's lesser than | |
47 | * the time namespace's offset - it's already expired. | |
48 | */ | |
49 | tim = 0; | |
50 | } else { | |
51 | tim = ktime_sub(tim, offset); | |
52 | if (unlikely(tim > KTIME_MAX)) | |
53 | tim = KTIME_MAX; | |
54 | } | |
55 | ||
56 | return tim; | |
57 | } | |
58 | ||
769071ac AV |
59 | static struct ucounts *inc_time_namespaces(struct user_namespace *ns) |
60 | { | |
61 | return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); | |
62 | } | |
63 | ||
64 | static void dec_time_namespaces(struct ucounts *ucounts) | |
65 | { | |
66 | dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); | |
67 | } | |
68 | ||
69 | /** | |
70 | * clone_time_ns - Clone a time namespace | |
71 | * @user_ns: User namespace which owns a new namespace. | |
72 | * @old_ns: Namespace to clone | |
73 | * | |
74 | * Clone @old_ns and set the clone refcount to 1 | |
75 | * | |
76 | * Return: The new namespace or ERR_PTR. | |
77 | */ | |
78 | static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, | |
79 | struct time_namespace *old_ns) | |
80 | { | |
81 | struct time_namespace *ns; | |
82 | struct ucounts *ucounts; | |
83 | int err; | |
84 | ||
85 | err = -ENOSPC; | |
86 | ucounts = inc_time_namespaces(user_ns); | |
87 | if (!ucounts) | |
88 | goto fail; | |
89 | ||
90 | err = -ENOMEM; | |
30acd0bd | 91 | ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); |
769071ac AV |
92 | if (!ns) |
93 | goto fail_dec; | |
94 | ||
28c41efd | 95 | refcount_set(&ns->ns.count, 1); |
769071ac | 96 | |
30acd0bd | 97 | ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); |
afaa7b5a DS |
98 | if (!ns->vvar_page) |
99 | goto fail_free; | |
100 | ||
769071ac AV |
101 | err = ns_alloc_inum(&ns->ns); |
102 | if (err) | |
afaa7b5a | 103 | goto fail_free_page; |
769071ac AV |
104 | |
105 | ns->ucounts = ucounts; | |
106 | ns->ns.ops = &timens_operations; | |
107 | ns->user_ns = get_user_ns(user_ns); | |
af993f58 | 108 | ns->offsets = old_ns->offsets; |
afaa7b5a | 109 | ns->frozen_offsets = false; |
769071ac AV |
110 | return ns; |
111 | ||
afaa7b5a DS |
112 | fail_free_page: |
113 | __free_page(ns->vvar_page); | |
769071ac AV |
114 | fail_free: |
115 | kfree(ns); | |
116 | fail_dec: | |
117 | dec_time_namespaces(ucounts); | |
118 | fail: | |
119 | return ERR_PTR(err); | |
120 | } | |
121 | ||
122 | /** | |
123 | * copy_time_ns - Create timens_for_children from @old_ns | |
124 | * @flags: Cloning flags | |
125 | * @user_ns: User namespace which owns a new namespace. | |
126 | * @old_ns: Namespace to clone | |
127 | * | |
128 | * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; | |
129 | * adds a refcounter to @old_ns otherwise. | |
130 | * | |
131 | * Return: timens_for_children namespace or ERR_PTR. | |
132 | */ | |
133 | struct time_namespace *copy_time_ns(unsigned long flags, | |
134 | struct user_namespace *user_ns, struct time_namespace *old_ns) | |
135 | { | |
136 | if (!(flags & CLONE_NEWTIME)) | |
137 | return get_time_ns(old_ns); | |
138 | ||
139 | return clone_time_ns(user_ns, old_ns); | |
140 | } | |
141 | ||
afaa7b5a DS |
142 | static struct timens_offset offset_from_ts(struct timespec64 off) |
143 | { | |
144 | struct timens_offset ret; | |
145 | ||
146 | ret.sec = off.tv_sec; | |
147 | ret.nsec = off.tv_nsec; | |
148 | ||
149 | return ret; | |
150 | } | |
151 | ||
152 | /* | |
153 | * A time namespace VVAR page has the same layout as the VVAR page which | |
154 | * contains the system wide VDSO data. | |
155 | * | |
156 | * For a normal task the VVAR pages are installed in the normal ordering: | |
157 | * VVAR | |
158 | * PVCLOCK | |
159 | * HVCLOCK | |
160 | * TIMENS <- Not really required | |
161 | * | |
162 | * Now for a timens task the pages are installed in the following order: | |
163 | * TIMENS | |
164 | * PVCLOCK | |
165 | * HVCLOCK | |
166 | * VVAR | |
167 | * | |
168 | * The check for vdso_data->clock_mode is in the unlikely path of | |
169 | * the seq begin magic. So for the non-timens case most of the time | |
170 | * 'seq' is even, so the branch is not taken. | |
171 | * | |
172 | * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check | |
173 | * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the | |
174 | * update to finish and for 'seq' to become even anyway. | |
175 | * | |
2d6b01bd TG |
176 | * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which |
177 | * enforces the time namespace handling path. | |
afaa7b5a DS |
178 | */ |
179 | static void timens_setup_vdso_data(struct vdso_data *vdata, | |
180 | struct time_namespace *ns) | |
181 | { | |
182 | struct timens_offset *offset = vdata->offset; | |
183 | struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); | |
184 | struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); | |
185 | ||
186 | vdata->seq = 1; | |
2d6b01bd | 187 | vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; |
afaa7b5a DS |
188 | offset[CLOCK_MONOTONIC] = monotonic; |
189 | offset[CLOCK_MONOTONIC_RAW] = monotonic; | |
190 | offset[CLOCK_MONOTONIC_COARSE] = monotonic; | |
191 | offset[CLOCK_BOOTTIME] = boottime; | |
192 | offset[CLOCK_BOOTTIME_ALARM] = boottime; | |
193 | } | |
194 | ||
d6c494e8 JH |
195 | struct page *find_timens_vvar_page(struct vm_area_struct *vma) |
196 | { | |
197 | if (likely(vma->vm_mm == current->mm)) | |
198 | return current->nsproxy->time_ns->vvar_page; | |
199 | ||
200 | /* | |
201 | * VM_PFNMAP | VM_IO protect .fault() handler from being called | |
202 | * through interfaces like /proc/$pid/mem or | |
203 | * process_vm_{readv,writev}() as long as there's no .access() | |
204 | * in special_mapping_vmops(). | |
205 | * For more details check_vma_flags() and __access_remote_vm() | |
206 | */ | |
207 | ||
208 | WARN(1, "vvar_page accessed remotely"); | |
209 | ||
210 | return NULL; | |
211 | } | |
212 | ||
afaa7b5a DS |
213 | /* |
214 | * Protects possibly multiple offsets writers racing each other | |
215 | * and tasks entering the namespace. | |
216 | */ | |
217 | static DEFINE_MUTEX(offset_lock); | |
218 | ||
219 | static void timens_set_vvar_page(struct task_struct *task, | |
220 | struct time_namespace *ns) | |
221 | { | |
222 | struct vdso_data *vdata; | |
223 | unsigned int i; | |
224 | ||
225 | if (ns == &init_time_ns) | |
226 | return; | |
227 | ||
228 | /* Fast-path, taken by every task in namespace except the first. */ | |
229 | if (likely(ns->frozen_offsets)) | |
230 | return; | |
231 | ||
232 | mutex_lock(&offset_lock); | |
233 | /* Nothing to-do: vvar_page has been already initialized. */ | |
234 | if (ns->frozen_offsets) | |
235 | goto out; | |
236 | ||
237 | ns->frozen_offsets = true; | |
238 | vdata = arch_get_vdso_data(page_address(ns->vvar_page)); | |
239 | ||
240 | for (i = 0; i < CS_BASES; i++) | |
241 | timens_setup_vdso_data(&vdata[i], ns); | |
242 | ||
243 | out: | |
244 | mutex_unlock(&offset_lock); | |
245 | } | |
246 | ||
28c41efd | 247 | void free_time_ns(struct time_namespace *ns) |
769071ac | 248 | { |
769071ac AV |
249 | dec_time_namespaces(ns->ucounts); |
250 | put_user_ns(ns->user_ns); | |
251 | ns_free_inum(&ns->ns); | |
afaa7b5a | 252 | __free_page(ns->vvar_page); |
769071ac AV |
253 | kfree(ns); |
254 | } | |
255 | ||
256 | static struct time_namespace *to_time_ns(struct ns_common *ns) | |
257 | { | |
258 | return container_of(ns, struct time_namespace, ns); | |
259 | } | |
260 | ||
261 | static struct ns_common *timens_get(struct task_struct *task) | |
262 | { | |
263 | struct time_namespace *ns = NULL; | |
264 | struct nsproxy *nsproxy; | |
265 | ||
266 | task_lock(task); | |
267 | nsproxy = task->nsproxy; | |
268 | if (nsproxy) { | |
269 | ns = nsproxy->time_ns; | |
270 | get_time_ns(ns); | |
271 | } | |
272 | task_unlock(task); | |
273 | ||
274 | return ns ? &ns->ns : NULL; | |
275 | } | |
276 | ||
277 | static struct ns_common *timens_for_children_get(struct task_struct *task) | |
278 | { | |
279 | struct time_namespace *ns = NULL; | |
280 | struct nsproxy *nsproxy; | |
281 | ||
282 | task_lock(task); | |
283 | nsproxy = task->nsproxy; | |
284 | if (nsproxy) { | |
285 | ns = nsproxy->time_ns_for_children; | |
286 | get_time_ns(ns); | |
287 | } | |
288 | task_unlock(task); | |
289 | ||
290 | return ns ? &ns->ns : NULL; | |
291 | } | |
292 | ||
/* Drop a reference on the time namespace that embeds @ns. */
static void timens_put(struct ns_common *ns)
{
	struct time_namespace *time_ns = to_time_ns(ns);

	put_time_ns(time_ns);
}
297 | ||
/*
 * Finish switching @tsk to @ns: make sure the VVAR page of @ns is set up,
 * and only then call vdso_join_timens() for the task.
 */
void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
{
	/* The VVAR page has to be populated before the task joins. */
	timens_set_vvar_page(tsk, ns);

	vdso_join_timens(tsk, ns);
}
303 | ||
f2a8d52e | 304 | static int timens_install(struct nsset *nsset, struct ns_common *new) |
769071ac | 305 | { |
f2a8d52e | 306 | struct nsproxy *nsproxy = nsset->nsproxy; |
769071ac AV |
307 | struct time_namespace *ns = to_time_ns(new); |
308 | ||
309 | if (!current_is_single_threaded()) | |
310 | return -EUSERS; | |
311 | ||
312 | if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || | |
f2a8d52e | 313 | !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) |
769071ac AV |
314 | return -EPERM; |
315 | ||
316 | get_time_ns(ns); | |
317 | put_time_ns(nsproxy->time_ns); | |
318 | nsproxy->time_ns = ns; | |
319 | ||
320 | get_time_ns(ns); | |
321 | put_time_ns(nsproxy->time_ns_for_children); | |
322 | nsproxy->time_ns_for_children = ns; | |
323 | return 0; | |
324 | } | |
325 | ||
5c62634f | 326 | void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) |
769071ac AV |
327 | { |
328 | struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; | |
329 | struct time_namespace *ns = to_time_ns(nsc); | |
330 | ||
331 | /* create_new_namespaces() already incremented the ref counter */ | |
332 | if (nsproxy->time_ns == nsproxy->time_ns_for_children) | |
5c62634f | 333 | return; |
769071ac AV |
334 | |
335 | get_time_ns(ns); | |
336 | put_time_ns(nsproxy->time_ns); | |
337 | nsproxy->time_ns = ns; | |
338 | ||
5cfea9a1 | 339 | timens_commit(tsk, ns); |
769071ac AV |
340 | } |
341 | ||
342 | static struct user_namespace *timens_owner(struct ns_common *ns) | |
343 | { | |
344 | return to_time_ns(ns)->user_ns; | |
345 | } | |
346 | ||
04a8682a AV |
347 | static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) |
348 | { | |
94d440d6 AV |
349 | char *clock; |
350 | ||
351 | switch (clockid) { | |
352 | case CLOCK_BOOTTIME: | |
353 | clock = "boottime"; | |
354 | break; | |
355 | case CLOCK_MONOTONIC: | |
356 | clock = "monotonic"; | |
357 | break; | |
358 | default: | |
359 | clock = "unknown"; | |
360 | break; | |
361 | } | |
362 | seq_printf(m, "%-10s %10lld %9ld\n", clock, ts->tv_sec, ts->tv_nsec); | |
04a8682a AV |
363 | } |
364 | ||
365 | void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) | |
366 | { | |
367 | struct ns_common *ns; | |
368 | struct time_namespace *time_ns; | |
369 | ||
370 | ns = timens_for_children_get(p); | |
371 | if (!ns) | |
372 | return; | |
373 | time_ns = to_time_ns(ns); | |
374 | ||
375 | show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); | |
376 | show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); | |
377 | put_time_ns(time_ns); | |
378 | } | |
379 | ||
380 | int proc_timens_set_offset(struct file *file, struct task_struct *p, | |
381 | struct proc_timens_offset *offsets, int noffsets) | |
382 | { | |
383 | struct ns_common *ns; | |
384 | struct time_namespace *time_ns; | |
385 | struct timespec64 tp; | |
386 | int i, err; | |
387 | ||
388 | ns = timens_for_children_get(p); | |
389 | if (!ns) | |
390 | return -ESRCH; | |
391 | time_ns = to_time_ns(ns); | |
392 | ||
393 | if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { | |
394 | put_time_ns(time_ns); | |
395 | return -EPERM; | |
396 | } | |
397 | ||
398 | for (i = 0; i < noffsets; i++) { | |
399 | struct proc_timens_offset *off = &offsets[i]; | |
400 | ||
401 | switch (off->clockid) { | |
402 | case CLOCK_MONOTONIC: | |
403 | ktime_get_ts64(&tp); | |
404 | break; | |
405 | case CLOCK_BOOTTIME: | |
406 | ktime_get_boottime_ts64(&tp); | |
407 | break; | |
408 | default: | |
409 | err = -EINVAL; | |
410 | goto out; | |
411 | } | |
412 | ||
413 | err = -ERANGE; | |
414 | ||
415 | if (off->val.tv_sec > KTIME_SEC_MAX || | |
416 | off->val.tv_sec < -KTIME_SEC_MAX) | |
417 | goto out; | |
418 | ||
419 | tp = timespec64_add(tp, off->val); | |
420 | /* | |
421 | * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is | |
422 | * still unreachable. | |
423 | */ | |
424 | if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) | |
425 | goto out; | |
426 | } | |
427 | ||
428 | mutex_lock(&offset_lock); | |
429 | if (time_ns->frozen_offsets) { | |
430 | err = -EACCES; | |
431 | goto out_unlock; | |
432 | } | |
433 | ||
434 | err = 0; | |
435 | /* Don't report errors after this line */ | |
436 | for (i = 0; i < noffsets; i++) { | |
437 | struct proc_timens_offset *off = &offsets[i]; | |
438 | struct timespec64 *offset = NULL; | |
439 | ||
440 | switch (off->clockid) { | |
441 | case CLOCK_MONOTONIC: | |
442 | offset = &time_ns->offsets.monotonic; | |
443 | break; | |
444 | case CLOCK_BOOTTIME: | |
445 | offset = &time_ns->offsets.boottime; | |
446 | break; | |
447 | } | |
448 | ||
449 | *offset = off->val; | |
450 | } | |
451 | ||
452 | out_unlock: | |
453 | mutex_unlock(&offset_lock); | |
454 | out: | |
455 | put_time_ns(time_ns); | |
456 | ||
457 | return err; | |
458 | } | |
459 | ||
769071ac AV |
460 | const struct proc_ns_operations timens_operations = { |
461 | .name = "time", | |
462 | .type = CLONE_NEWTIME, | |
463 | .get = timens_get, | |
464 | .put = timens_put, | |
465 | .install = timens_install, | |
466 | .owner = timens_owner, | |
467 | }; | |
468 | ||
469 | const struct proc_ns_operations timens_for_children_operations = { | |
470 | .name = "time_for_children", | |
b801f1e2 | 471 | .real_ns_name = "time", |
769071ac AV |
472 | .type = CLONE_NEWTIME, |
473 | .get = timens_for_children_get, | |
474 | .put = timens_put, | |
475 | .install = timens_install, | |
476 | .owner = timens_owner, | |
477 | }; | |
478 | ||
479 | struct time_namespace init_time_ns = { | |
28c41efd | 480 | .ns.count = REFCOUNT_INIT(3), |
769071ac AV |
481 | .user_ns = &init_user_ns, |
482 | .ns.inum = PROC_TIME_INIT_INO, | |
483 | .ns.ops = &timens_operations, | |
afaa7b5a | 484 | .frozen_offsets = true, |
769071ac | 485 | }; |