diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6f965864cc029ad47f63c67f6713cb79a519599e..3d4155892a1e78cf6b8b7850e9a64c460fe825b5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,7 @@
 #include <linux/kthread.h>
 #include <linux/tracefs.h>
 #include <linux/uaccess.h>
-#include <linux/bsearch.h>
+#include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/sort.h>
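
The include swap above tracks the data-structure change made later in this
diff: the sorted pid array that was searched with bsearch() is replaced by a
pid bitmap allocated with vzalloc(), so <linux/bsearch.h> is no longer needed
and <linux/vmalloc.h> is. As a rough model of the new lookup, a minimal
userspace sketch (not code from the patch; PID_MAX_SKETCH stands in for the
default pid_max):

	#include <stdbool.h>
	#include <limits.h>

	#define PID_MAX_SKETCH 32768	/* default pid_max */
	#define LONG_BITS (sizeof(unsigned long) * CHAR_BIT)

	static unsigned long pids[PID_MAX_SKETCH / LONG_BITS];

	/* O(1) membership test, one bit per pid, versus the old
	 * O(log n) bsearch() over a sorted pid_t array */
	static bool pid_is_set(unsigned int pid)
	{
		if (pid >= PID_MAX_SKETCH)
			return false;
		return pids[pid / LONG_BITS] & (1UL << (pid % LONG_BITS));
	}
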
@@ -204,6 +204,24 @@ static void trace_destroy_fields(struct trace_event_call *call)
        }
 }
 
+/*
+ * run-time version of trace_event_get_offsets_<call>() that returns the last
+ * accessible offset of trace fields excluding __dynamic_array bytes
+ */
+int trace_event_get_offsets(struct trace_event_call *call)
+{
+       struct ftrace_event_field *tail;
+       struct list_head *head;
+
+       head = trace_get_fields(call);
+       /*
+        * head->next points to the last field with the largest offset,
+        * since it was added last by trace_define_field()
+        */
+       tail = list_first_entry(head, struct ftrace_event_field, link);
+       return tail->offset + tail->size;
+}
+
 int trace_event_raw_init(struct trace_event_call *call)
 {
        int id;
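
The comment in trace_event_get_offsets() relies on trace_define_field()
inserting with list_add(), which pushes at the head of the list, so the first
entry is the most recently defined field, i.e. the one with the largest
offset. A minimal userspace sketch of that push-at-head behaviour (names are
illustrative only):

	#include <stdio.h>

	struct field { int offset, size; struct field *next; };

	/* list_add()-style insertion at the head */
	static void push(struct field **head, struct field *f)
	{
		f->next = *head;
		*head = f;
	}

	int main(void)
	{
		struct field a = { .offset = 0, .size = 4 };
		struct field b = { .offset = 4, .size = 8 };
		struct field *head = NULL;

		push(&head, &a);
		push(&head, &b);	/* defined last, largest offset */

		/* first entry == last field defined: prints 12 */
		printf("%d\n", head->offset + head->size);
		return 0;
	}
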
@@ -363,6 +381,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 {
        struct trace_event_call *call = file->event_call;
        struct trace_array *tr = file->tr;
+       unsigned long file_flags = file->flags;
        int ret = 0;
        int disable;
 
@@ -445,6 +464,15 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
                break;
        }
 
+       /* Enable or disable use of trace_buffered_event */
+       if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) !=
+           (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) {
+               if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
+                       trace_buffered_event_enable();
+               else
+                       trace_buffered_event_disable();
+       }
+
        return ret;
 }
 
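
The file_flags snapshot taken at the top of __ftrace_event_enable_disable()
lets the code above detect a transition of the SOFT_DISABLED bit rather than
test its absolute state: the buffered event is enabled when the event becomes
soft disabled and released when it leaves that state. The edge-detection
idiom in isolation (a standalone sketch with a made-up flag value):

	#include <stdbool.h>

	#define FL_SOFT_DISABLED (1UL << 0)	/* illustrative bit value */

	/* true only when the bit actually changed across the update */
	static bool soft_disabled_toggled(unsigned long old_flags,
					  unsigned long new_flags)
	{
		return (old_flags ^ new_flags) & FL_SOFT_DISABLED;
	}
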
@@ -471,24 +499,26 @@ static void ftrace_clear_events(struct trace_array *tr)
        mutex_unlock(&event_mutex);
 }
 
-static int cmp_pid(const void *key, const void *elt)
+/* Shouldn't this be in a header? */
+extern int pid_max;
+
+/* Returns true if found in filter */
+static bool
+find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
 {
-       const pid_t *search_pid = key;
-       const pid_t *pid = elt;
+       /*
+        * If pid_max changed after filtered_pids was created, then by
+        * default ignore any pid greater than the previous pid_max.
+        */
+       if (search_pid >= filtered_pids->pid_max)
+               return false;
 
-       if (*search_pid == *pid)
-               return 0;
-       if (*search_pid < *pid)
-               return -1;
-       return 1;
+       return test_bit(search_pid, filtered_pids->pids);
 }
 
 static bool
-check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
+ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 {
-       pid_t search_pid;
-       pid_t *pid;
-
        /*
         * Return false, because if filtered_pids does not exist,
         * all pids are good to trace.
@@ -496,15 +526,68 @@ check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
        if (!filtered_pids)
                return false;
 
-       search_pid = task->pid;
+       return !find_filtered_pid(filtered_pids, task->pid);
+}
+
+static void filter_add_remove_task(struct trace_pid_list *pid_list,
+                                  struct task_struct *self,
+                                  struct task_struct *task)
+{
+       if (!pid_list)
+               return;
+
+       /* For forks, we only add if the forking task is listed */
+       if (self) {
+               if (!find_filtered_pid(pid_list, self->pid))
+                       return;
+       }
+
+       /* Sorry, but we don't support pid_max changing after setting */
+       if (task->pid >= pid_list->pid_max)
+               return;
+
+       /* "self" is set for forks, and NULL for exits */
+       if (self)
+               set_bit(task->pid, pid_list->pids);
+       else
+               clear_bit(task->pid, pid_list->pids);
+}
+
+static void
+event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
+{
+       struct trace_pid_list *pid_list;
+       struct trace_array *tr = data;
+
+       pid_list = rcu_dereference_sched(tr->filtered_pids);
+       filter_add_remove_task(pid_list, NULL, task);
+}
+
+static void
+event_filter_pid_sched_process_fork(void *data,
+                                   struct task_struct *self,
+                                   struct task_struct *task)
+{
+       struct trace_pid_list *pid_list;
+       struct trace_array *tr = data;
 
-       pid = bsearch(&search_pid, filtered_pids->pids,
-                     filtered_pids->nr_pids, sizeof(pid_t),
-                     cmp_pid);
-       if (!pid)
-               return true;
+       pid_list = rcu_dereference_sched(tr->filtered_pids);
+       filter_add_remove_task(pid_list, self, task);
+}
 
-       return false;
+void trace_event_follow_fork(struct trace_array *tr, bool enable)
+{
+       if (enable) {
+               register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
+                                                      tr, INT_MIN);
+               register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit,
+                                                      tr, INT_MAX);
+       } else {
+               unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
+                                                   tr);
+               unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit,
+                                                   tr);
+       }
 }
 
 static void
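
trace_event_follow_fork() registers the two probes at the extreme priorities
INT_MIN and INT_MAX to pin their position relative to any other probes on the
same tracepoints. Combined with filter_add_remove_task() above, the effective
semantics are: a child enters the filter only if its forking parent is
already listed, and a task leaves the filter when it exits, so its pid can be
recycled safely. A self-contained userspace sketch of those semantics (bounds
checks against pid_max omitted for brevity):

	#include <stdbool.h>
	#include <limits.h>

	#define NBITS (sizeof(unsigned long) * CHAR_BIT)

	static bool test_pid(const unsigned long *pids, int pid)
	{
		return pids[pid / NBITS] & (1UL << (pid % NBITS));
	}

	/* parent < 0 models the exit path (self == NULL above) */
	static void add_remove(unsigned long *pids, int parent, int task)
	{
		if (parent >= 0) {		/* fork */
			if (test_pid(pids, parent))
				pids[task / NBITS] |= 1UL << (task % NBITS);
		} else {			/* exit */
			pids[task / NBITS] &= ~(1UL << (task % NBITS));
		}
	}
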
@@ -517,8 +600,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
        pid_list = rcu_dereference_sched(tr->filtered_pids);
 
        this_cpu_write(tr->trace_buffer.data->ignore_pid,
-                      check_ignore_pid(pid_list, prev) &&
-                      check_ignore_pid(pid_list, next));
+                      ignore_this_task(pid_list, prev) &&
+                      ignore_this_task(pid_list, next));
 }
 
 static void
@@ -531,7 +614,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
        pid_list = rcu_dereference_sched(tr->filtered_pids);
 
        this_cpu_write(tr->trace_buffer.data->ignore_pid,
-                      check_ignore_pid(pid_list, next));
+                      ignore_this_task(pid_list, next));
 }
 
 static void
@@ -547,7 +630,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
        pid_list = rcu_dereference_sched(tr->filtered_pids);
 
        this_cpu_write(tr->trace_buffer.data->ignore_pid,
-                      check_ignore_pid(pid_list, task));
+                      ignore_this_task(pid_list, task));
 }
 
 static void
@@ -564,7 +647,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
 
        /* Set tracing if current is enabled */
        this_cpu_write(tr->trace_buffer.data->ignore_pid,
-                      check_ignore_pid(pid_list, current));
+                      ignore_this_task(pid_list, current));
 }
 
 static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -602,7 +685,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
        /* Wait till all users are no longer using pid filtering */
        synchronize_sched();
 
-       free_pages((unsigned long)pid_list->pids, pid_list->order);
+       vfree(pid_list->pids);
        kfree(pid_list);
 }
 
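
The teardown above follows the standard RCU retire sequence: unpublish the
pointer, wait for every reader that might still be walking the old list (the
probes access it via rcu_dereference_sched(), i.e. with preemption disabled),
and only then free the memory. In outline, as a kernel-style sketch rather
than a function from this patch:

	static void retire_pid_list(struct trace_array *tr,
				    struct trace_pid_list *pid_list)
	{
		rcu_assign_pointer(tr->filtered_pids, NULL);	/* unpublish */
		synchronize_sched();	/* wait out all sched-RCU readers */
		vfree(pid_list->pids);	/* nothing can reference it now */
		kfree(pid_list);
	}
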
@@ -946,11 +1029,32 @@ static void t_stop(struct seq_file *m, void *p)
        mutex_unlock(&event_mutex);
 }
 
+static void *
+p_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       struct trace_array *tr = m->private;
+       struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
+       unsigned long pid = (unsigned long)v;
+
+       (*pos)++;
+
+       /* pid already is +1 of the actual previous bit */
+       pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+       /* Return pid + 1 to allow zero to be represented */
+       if (pid < pid_list->pid_max)
+               return (void *)(pid + 1);
+
+       return NULL;
+}
+
 static void *p_start(struct seq_file *m, loff_t *pos)
        __acquires(RCU)
 {
        struct trace_pid_list *pid_list;
        struct trace_array *tr = m->private;
+       unsigned long pid;
+       loff_t l = 0;
 
        /*
         * Grab the mutex, to keep calls to p_next() having the same
@@ -963,10 +1067,18 @@ static void *p_start(struct seq_file *m, loff_t *pos)
 
        pid_list = rcu_dereference_sched(tr->filtered_pids);
 
-       if (!pid_list || *pos >= pid_list->nr_pids)
+       if (!pid_list)
+               return NULL;
+
+       pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+       if (pid >= pid_list->pid_max)
                return NULL;
 
-       return (void *)&pid_list->pids[*pos];
+       /* Return pid + 1 so that zero can be the exit value */
+       for (pid++; pid && l < *pos;
+            pid = (unsigned long)p_next(m, (void *)pid, &l))
+               ;
+       return (void *)pid;
 }
 
 static void p_stop(struct seq_file *m, void *p)
@@ -976,25 +1088,11 @@ static void p_stop(struct seq_file *m, void *p)
        mutex_unlock(&event_mutex);
 }
 
-static void *
-p_next(struct seq_file *m, void *v, loff_t *pos)
-{
-       struct trace_array *tr = m->private;
-       struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
-
-       (*pos)++;
-
-       if (*pos >= pid_list->nr_pids)
-               return NULL;
-
-       return (void *)&pid_list->pids[*pos];
-}
-
 static int p_show(struct seq_file *m, void *v)
 {
-       pid_t *pid = v;
+       unsigned long pid = (unsigned long)v - 1;
 
-       seq_printf(m, "%d\n", *pid);
+       seq_printf(m, "%lu\n", pid);
        return 0;
 }
 
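
The +1/-1 dance in p_start(), p_next() and p_show() exists because seq_file
treats a NULL iterator value as end-of-sequence, yet pid 0 (the idle task) is
a legitimate bit in the map. Encoding each pid as pid + 1 on the way out and
decoding it before printing keeps zero free to act as the terminator. The two
halves of the encoding in isolation (illustrative helper names):

	/* encode: pid 0 becomes cookie 1, so a valid pid is never NULL */
	static void *pid_to_cookie(unsigned long pid)
	{
		return (void *)(pid + 1);
	}

	/* decode: undo the offset before display */
	static unsigned long cookie_to_pid(void *v)
	{
		return (unsigned long)v - 1;
	}
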
@@ -1543,11 +1641,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
        return r;
 }
 
-static int max_pids(struct trace_pid_list *pid_list)
-{
-       return (PAGE_SIZE << pid_list->order) / sizeof(pid_t);
-}
-
 static void ignore_task_cpu(void *data)
 {
        struct trace_array *tr = data;
@@ -1561,7 +1654,7 @@ static void ignore_task_cpu(void *data)
                                             mutex_is_locked(&event_mutex));
 
        this_cpu_write(tr->trace_buffer.data->ignore_pid,
-                      check_ignore_pid(pid_list, current));
+                      ignore_this_task(pid_list, current));
 }
 
 static ssize_t
@@ -1571,7 +1664,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
        struct seq_file *m = filp->private_data;
        struct trace_array *tr = m->private;
        struct trace_pid_list *filtered_pids = NULL;
-       struct trace_pid_list *pid_list = NULL;
+       struct trace_pid_list *pid_list;
        struct trace_event_file *file;
        struct trace_parser parser;
        unsigned long val;
@@ -1579,7 +1672,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
        ssize_t read = 0;
        ssize_t ret = 0;
        pid_t pid;
-       int i;
+       int nr_pids = 0;
 
        if (!cnt)
                return 0;
@@ -1592,10 +1685,43 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
                return -ENOMEM;
 
        mutex_lock(&event_mutex);
+       filtered_pids = rcu_dereference_protected(tr->filtered_pids,
+                                            lockdep_is_held(&event_mutex));
+
        /*
-        * Load as many pids into the array before doing a
-        * swap from the tr->filtered_pids to the new list.
+        * Always create a new list rather than editing in place: the
+        * write is an all-or-nothing operation, so if it fails part
+        * way through, the current list must be left unmodified.
         */
+       pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+       if (!pid_list) {
+               read = -ENOMEM;
+               goto out;
+       }
+       pid_list->pid_max = READ_ONCE(pid_max);
+       /* Only truncating will shrink pid_max */
+       if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
+               pid_list->pid_max = filtered_pids->pid_max;
+       pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
+       if (!pid_list->pids) {
+               kfree(pid_list);
+               read = -ENOMEM;
+               goto out;
+       }
+       if (filtered_pids) {
+               /* copy the current bits to the new max */
+               pid = find_first_bit(filtered_pids->pids,
+                                    filtered_pids->pid_max);
+               while (pid < filtered_pids->pid_max) {
+                       set_bit(pid, pid_list->pids);
+                       pid = find_next_bit(filtered_pids->pids,
+                                           filtered_pids->pid_max,
+                                           pid + 1);
+                       nr_pids++;
+               }
+       }
+
        while (cnt > 0) {
 
                this_pos = 0;
@@ -1613,92 +1739,35 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
                ret = -EINVAL;
                if (kstrtoul(parser.buffer, 0, &val))
                        break;
-               if (val > INT_MAX)
+               if (val >= pid_list->pid_max)
                        break;
 
                pid = (pid_t)val;
 
-               ret = -ENOMEM;
-               if (!pid_list) {
-                       pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
-                       if (!pid_list)
-                               break;
-
-                       filtered_pids = rcu_dereference_protected(tr->filtered_pids,
-                                                       lockdep_is_held(&event_mutex));
-                       if (filtered_pids)
-                               pid_list->order = filtered_pids->order;
-                       else
-                               pid_list->order = 0;
-
-                       pid_list->pids = (void *)__get_free_pages(GFP_KERNEL,
-                                                                 pid_list->order);
-                       if (!pid_list->pids)
-                               break;
-
-                       if (filtered_pids) {
-                               pid_list->nr_pids = filtered_pids->nr_pids;
-                               memcpy(pid_list->pids, filtered_pids->pids,
-                                      pid_list->nr_pids * sizeof(pid_t));
-                       } else
-                               pid_list->nr_pids = 0;
-               }
-
-               if (pid_list->nr_pids >= max_pids(pid_list)) {
-                       pid_t *pid_page;
-
-                       pid_page = (void *)__get_free_pages(GFP_KERNEL,
-                                                           pid_list->order + 1);
-                       if (!pid_page)
-                               break;
-                       memcpy(pid_page, pid_list->pids,
-                              pid_list->nr_pids * sizeof(pid_t));
-                       free_pages((unsigned long)pid_list->pids, pid_list->order);
-
-                       pid_list->order++;
-                       pid_list->pids = pid_page;
-               }
+               set_bit(pid, pid_list->pids);
+               nr_pids++;
 
-               pid_list->pids[pid_list->nr_pids++] = pid;
                trace_parser_clear(&parser);
                ret = 0;
        }
        trace_parser_put(&parser);
 
        if (ret < 0) {
-               if (pid_list)
-                       free_pages((unsigned long)pid_list->pids, pid_list->order);
+               vfree(pid_list->pids);
                kfree(pid_list);
-               mutex_unlock(&event_mutex);
-               return ret;
-       }
-
-       if (!pid_list) {
-               mutex_unlock(&event_mutex);
-               return ret;
+               read = ret;
+               goto out;
        }
 
-       sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL);
-
-       /* Remove duplicates */
-       for (i = 1; i < pid_list->nr_pids; i++) {
-               int start = i;
-
-               while (i < pid_list->nr_pids &&
-                      pid_list->pids[i - 1] == pid_list->pids[i])
-                       i++;
-
-               if (start != i) {
-                       if (i < pid_list->nr_pids) {
-                               memmove(&pid_list->pids[start], &pid_list->pids[i],
-                                       (pid_list->nr_pids - i) * sizeof(pid_t));
-                               pid_list->nr_pids -= i - start;
-                               i = start;
-                       } else
-                               pid_list->nr_pids = start;
-               }
+       if (!nr_pids) {
+               /* Cleared the list of pids */
+               vfree(pid_list->pids);
+               kfree(pid_list);
+               read = ret;
+               if (!filtered_pids)
+                       goto out;
+               pid_list = NULL;
        }
-
        rcu_assign_pointer(tr->filtered_pids, pid_list);
 
        list_for_each_entry(file, &tr->events, list) {
@@ -1708,7 +1777,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
        if (filtered_pids) {
                synchronize_sched();
 
-               free_pages((unsigned long)filtered_pids->pids, filtered_pids->order);
+               vfree(filtered_pids->pids);
                kfree(filtered_pids);
        } else {
                /*
@@ -1745,10 +1814,12 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
         */
        on_each_cpu(ignore_task_cpu, tr, 1);
 
+ out:
        mutex_unlock(&event_mutex);
 
        ret = read;
-       *ppos += read;
+       if (read > 0)
+               *ppos += read;
 
        return ret;
 }
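
On sizing: the write path allocates one bit per possible pid, rounded up to
whole bytes, which is why vzalloc() replaces __get_free_pages(): the bitmap
only needs to be virtually contiguous and can track pid_max. With the default
pid_max of 32768 it occupies a single 4 KiB page; at the maximum of roughly
four million pids it is 512 KiB. The arithmetic as a sketch:

	#include <stddef.h>

	/* bytes for a one-bit-per-pid bitmap, rounded up:
	 *     32768 pids ->   4096 bytes (one page)
	 *   4194304 pids -> 524288 bytes (512 KiB)  */
	static size_t pid_bitmap_bytes(size_t pid_max)
	{
		return (pid_max + 7) >> 3;
	}
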
@@ -2103,6 +2174,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
                trace_create_file("trigger", 0644, file->dir, file,
                                  &event_trigger_fops);
 
+#ifdef CONFIG_HIST_TRIGGERS
+       trace_create_file("hist", 0444, file->dir, file,
+                         &event_hist_fops);
+#endif
        trace_create_file("format", 0444, file->dir, call,
                          &ftrace_event_format_fops);
 
@@ -3350,7 +3425,7 @@ static __init void event_trace_self_tests(void)
 
 static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
 
-static struct trace_array *event_tr;
+static struct trace_event_file event_trace_file __initdata;
 
 static void __init
 function_test_events_call(unsigned long ip, unsigned long parent_ip,
@@ -3374,17 +3449,17 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
 
        local_save_flags(flags);
 
-       event = trace_current_buffer_lock_reserve(&buffer,
-                                                 TRACE_FN, sizeof(*entry),
-                                                 flags, pc);
+       event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file,
+                                               TRACE_FN, sizeof(*entry),
+                                               flags, pc);
        if (!event)
                goto out;
        entry   = ring_buffer_event_data(event);
        entry->ip                       = ip;
        entry->parent_ip                = parent_ip;
 
-       trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc);
-
+       event_trigger_unlock_commit(&event_trace_file, buffer, event,
+                                   entry, flags, pc);
  out:
        atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
        preempt_enable_notrace();
@@ -3399,9 +3474,11 @@ static struct ftrace_ops trace_ops __initdata  =
 static __init void event_trace_self_test_with_function(void)
 {
        int ret;
-       event_tr = top_trace_array();
-       if (WARN_ON(!event_tr))
+
+       event_trace_file.tr = top_trace_array();
+       if (WARN_ON(!event_trace_file.tr))
                return;
+
        ret = register_ftrace_function(&trace_ops);
        if (WARN_ON(ret < 0)) {
                pr_info("Failed to enable function tracer for event tests\n");