Merge branch 'linus' into tracing/core
author     Ingo Molnar <mingo@elte.hu>
           Tue, 11 Aug 2009 12:19:09 +0000 (14:19 +0200)
committer  Ingo Molnar <mingo@elte.hu>
           Tue, 11 Aug 2009 12:19:09 +0000 (14:19 +0200)
Conflicts:
kernel/trace/trace_events_filter.c

We use the tracing/core version.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
24 files changed:
Documentation/kernel-parameters.txt
Documentation/trace/events.txt
Documentation/trace/ring-buffer-design.txt [new file with mode: 0644]
arch/x86/kernel/ftrace.c
include/linux/ftrace_event.h
include/linux/ring_buffer.h
include/trace/ftrace.h
kernel/kprobes.c
kernel/trace/ftrace.c
kernel/trace/kmemtrace.c
kernel/trace/ring_buffer.c
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_events.c
kernel/trace/trace_events_filter.c
kernel/trace/trace_functions.c
kernel/trace/trace_functions_graph.c
kernel/trace/trace_sched_switch.c
kernel/trace/trace_selftest.c
kernel/trace/trace_stack.c
kernel/trace/trace_stat.c
kernel/trace/trace_stat.h
kernel/trace/trace_workqueue.c
scripts/recordmcount.pl

index dd1a6d4bb7473c4966a720c40e8af0492f9853c5..81cdb7d5e3801a604c3da0686838559068c25403 100644 (file)
@@ -2476,6 +2476,11 @@ and is between 256 and 4096 characters. It is defined in the file
        trace_buf_size=nn[KMG]
                        [FTRACE] will set tracing buffer size.
 
+       trace_event=[event-list]
+                       [FTRACE] Set and start specified trace events in order
+                       to facilitate early boot debugging.
+                       See also Documentation/trace/events.txt
+
        trix=           [HW,OSS] MediaTrix AudioTrix Pro
                        Format:
                        <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
index f157d7594ea7c2bdf991b25f84b5580acec505e3..2bcc8d4dea29ff0cf77a9caf3e77838fe0534cd7 100644 (file)
@@ -83,6 +83,15 @@ When reading one of these enable files, there are four results:
  X - there is a mixture of events enabled and disabled
  ? - this file does not affect any event
 
+2.3 Boot option
+---------------
+
+In order to facilitate early boot debugging, use the boot option:
+
+       trace_event=[event-list]
+
+The format of this boot option is the same as described in section 2.1.
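+
+For example, to enable two scheduler events from early boot (the event
+names here are only illustrative; use names as listed in the
+available_events file described in section 2.1):
+
+	trace_event=sched_wakeup,sched_switch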
+
 3. Defining an event-enabled tracepoint
 =======================================
 
diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt
new file mode 100644 (file)
index 0000000..5b1d23d
--- /dev/null
@@ -0,0 +1,955 @@
+               Lockless Ring Buffer Design
+               ===========================
+
+Copyright 2009 Red Hat Inc.
+   Author:   Steven Rostedt <srostedt@redhat.com>
+  License:   The GNU Free Documentation License, Version 1.2
+               (dual licensed under the GPL v2)
+Reviewers:   Mathieu Desnoyers, Huang Ying, Hidetoshi Seto,
+            and Frederic Weisbecker.
+
+
+Written for: 2.6.31
+
+Terminology used in this Document
+---------------------------------
+
+tail - where new writes happen in the ring buffer.
+
+head - where new reads happen in the ring buffer.
+
+producer - the task that writes into the ring buffer (same as writer)
+
+writer - same as producer
+
+consumer - the task that reads from the buffer (same as reader)
+
+reader - same as consumer.
+
+reader_page - A page outside the ring buffer used solely (for the most part)
+    by the reader.
+
+head_page - a pointer to the page that the reader will use next
+
+tail_page - a pointer to the page that will be written to next
+
+commit_page - a pointer to the page with the last finished non-nested write.
+
+cmpxchg - hardware assisted atomic transaction that performs the following:
+
+   A = B iff previous A == C
+
+   R = cmpxchg(A, C, B) is saying that we replace A with B if and only if
+      current A is equal to C, and we put the old (current) A into R
+
+   R gets the previous A regardless of whether A is updated with B or not.
+
+   To see if the update was successful, a compare of R == C may be used.
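+
+   A rough C illustration of these semantics only (the real cmpxchg is
+   a single atomic hardware operation, not open coded like this):
+
+      static unsigned long pseudo_cmpxchg(unsigned long *A,
+                                          unsigned long C,
+                                          unsigned long B)
+      {
+              unsigned long R = *A;     /* R gets the previous A */
+
+              if (R == C)               /* update only if current A == C */
+                      *A = B;
+              return R;                 /* caller tests R == C for success */
+      }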
+
+The Generic Ring Buffer
+-----------------------
+
+The ring buffer can be used in either an overwrite mode or in
+producer/consumer mode.
+
+Producer/consumer mode is where, if the producer were to fill up the
+buffer before the consumer could free up anything, the producer
+will stop writing to the buffer. This will lose the most recent events.
+
+Overwrite mode is where, if the producer were to fill up the buffer
+before the consumer could free up anything, the producer will
+overwrite the older data. This will lose the oldest events.
+
+No two writers can write at the same time (on the same per-cpu buffer),
+but a writer may interrupt another writer; the interrupting writer
+must finish before the previous writer may continue. This is very
+important to the algorithm. The writers act like a "stack". The way
+interrupts work enforces this behavior.
+
+
+  writer1 start
+     <preempted> writer2 start
+         <preempted> writer3 start
+                     writer3 finishes
+                 writer2 finishes
+  writer1 finishes
+
+This is very much like a writer being preempted by an interrupt and
+the interrupt doing a write as well.
+
+Readers can happen at any time. But no two readers may run at the
+same time, nor can a reader preempt/interrupt another reader. A reader
+cannot preempt/interrupt a writer, but it may read/consume from the
+buffer at the same time as a writer is writing, as long as the reader
+is on another processor. A reader may read on its own processor
+and can be preempted by a writer.
+
+A writer can preempt a reader, but a reader cannot preempt a writer.
+But a reader can read the buffer at the same time (on another processor)
+as a writer.
+
+The ring buffer is made up of a list of pages held together by a linked list.
+
+At initialization a reader page is allocated for the reader that is not
+part of the ring buffer.
+
+The head_page, tail_page and commit_page are all initialized to point
+to the same page.
+
+The reader page is initialized to have its next pointer pointing to
+the head page, and its previous pointer pointing to a page before
+the head page.
+
+The reader has its own page to use. At start up time, this page is
+allocated but is not attached to the list. When the reader wants
+to read from the buffer, if its page is empty (like it is on start up)
+it will swap its page with the head_page. The old reader page will
+become part of the ring buffer and the head_page will be removed.
+The page after the inserted page (old reader_page) will become the
+new head page.
+
+Once the new page is given to the reader, the reader can do what
+it wants with it, as long as a writer has left that page.
+
+A sample of how the reader page is swapped: Note, this does not
+show the head page in the buffer; it is for demonstrating a swap
+only.
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |
+  +------+
+                  +---+   +---+   +---+
+                  |   |-->|   |-->|   |
+                  |   |<--|   |<--|   |
+                  +---+   +---+   +---+
+                   ^ |             ^ |
+                   | +-------------+ |
+                   +-----------------+
+
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |-------------------+
+  +------+                   v
+    |             +---+   +---+   +---+
+    |             |   |-->|   |-->|   |
+    |             |   |<--|   |<--|   |<-+
+    |             +---+   +---+   +---+  |
+    |              ^ |             ^ |   |
+    |              | +-------------+ |   |
+    |              +-----------------+   |
+    +------------------------------------+
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |-------------------+
+  +------+ <---------------+ v
+    |  ^          +---+   +---+   +---+
+    |  |          |   |-->|   |-->|   |
+    |  |          |   |   |   |<--|   |<-+
+    |  |          +---+   +---+   +---+  |
+    |  |             |             ^ |   |
+    |  |             +-------------+ |   |
+    |  +-----------------------------+   |
+    +------------------------------------+
+
+  +------+
+  |buffer|          RING BUFFER
+  |page  |-------------------+
+  +------+ <---------------+ v
+    |  ^          +---+   +---+   +---+
+    |  |          |   |   |   |-->|   |
+    |  |  New     |   |   |   |<--|   |<-+
+    |  | Reader   +---+   +---+   +---+  |
+    |  |  page ----^                 |   |
+    |  |                             |   |
+    |  +-----------------------------+   |
+    +------------------------------------+
+
+
+
+It is possible that the page swapped is the commit page and the tail page,
+if what is in the ring buffer is less than what is held in a buffer page.
+
+
+          reader page    commit page   tail page
+              |              |             |
+              v              |             |
+             +---+           |             |
+             |   |<----------+             |
+             |   |<------------------------+
+             |   |------+
+             +---+      |
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+This case is still valid for this algorithm.
+When the writer leaves the page, it simply goes into the ring buffer
+since the reader page still points to the next location in the ring
+buffer.
+
+
+The main pointers:
+
+  reader page - The page used solely by the reader and is not part
+                of the ring buffer (may be swapped in)
+
+  head page - the next page in the ring buffer that will be swapped
+              with the reader page.
+
+  tail page - the page where the next write will take place.
+
+  commit page - the page that last finished a write.
+
+The commit page is only updated by the outermost writer in the
+writer stack. A writer that preempts another writer will not move the
+commit page.
+
+When data is written into the ring buffer, a position is reserved
+in the ring buffer and passed back to the writer. When the writer
+is finished writing data into that position, it commits the write.
+
+Another write (or a read) may take place at any time during this
+transaction. If another write happens, it must finish before the
+previous write may continue.
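+
+From a writer's point of view, this reserve/commit pair is what the
+ring buffer API exposes. A minimal usage sketch, assuming the
+ring_buffer_lock_reserve(), ring_buffer_event_data() and
+ring_buffer_unlock_commit() interfaces from include/linux/ring_buffer.h
+(struct my_entry here is a made-up payload type):
+
+   struct ring_buffer_event *event;
+   struct my_entry *entry;
+
+   /* reserve space on the tail page; this moves the tail pointer */
+   event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
+   if (!event)
+           return;         /* e.g. buffer full in producer/consumer mode */
+
+   entry = ring_buffer_event_data(event);
+   /* ... fill in the reserved area ... */
+
+   /* commit; the commit page only moves for the outermost writer */
+   ring_buffer_unlock_commit(buffer, event);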
+
+
+   Write reserve:
+
+       Buffer page
+      +---------+
+      |written  |
+      +---------+  <--- given back to writer (current commit)
+      |reserved |
+      +---------+ <--- tail pointer
+      | empty   |
+      +---------+
+
+   Write commit:
+
+       Buffer page
+      +---------+
+      |written  |
+      +---------+
+      |written  |
+      +---------+  <--- next position for write (current commit)
+      | empty   |
+      +---------+
+
+
+ If a write happens after the first reserve:
+
+       Buffer page
+      +---------+
+      |written  |
+      +---------+  <-- current commit
+      |reserved |
+      +---------+  <--- given back to second writer
+      |reserved |
+      +---------+ <--- tail pointer
+
+  After second writer commits:
+
+
+       Buffer page
+      +---------+
+      |written  |
+      +---------+  <--(last full commit)
+      |reserved |
+      +---------+
+      |pending  |
+      |commit   |
+      +---------+ <--- tail pointer
+
+  When the first writer commits:
+
+       Buffer page
+      +---------+
+      |written  |
+      +---------+
+      |written  |
+      +---------+
+      |written  |
+      +---------+  <--(last full commit and tail pointer)
+
+
+The commit pointer points to the last write location that was
+committed without preempting another write. When a write that
+preempted another write is committed, it only becomes a pending commit
+and will not be a full commit until all writes have been committed.
+
+The commit page points to the page that has the last full commit.
+The tail page points to the page with the last write (before
+committing).
+
+The tail page is always equal to or after the commit page. It may
+be several pages ahead. If the tail page catches up to the commit
+page then no more writes may take place (regardless of the mode
+of the ring buffer: overwrite or producer/consumer).
+
+The order of pages is:
+
+ head page
+ commit page
+ tail page
+
+Possible scenario:
+                             tail page
+  head page         commit page  |
+      |                 |        |
+      v                 v        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+There is a special case where the head page is after the commit page
+and possibly the tail page. That is when the commit (and tail) page has been
+swapped with the reader page. This is because the head page is always
+part of the ring buffer, but the reader page is not. Whenever there
+has been less than a full page committed inside the ring buffer,
+and a reader swaps out a page, it will be swapping out the commit page.
+
+
+          reader page    commit page   tail page
+              |              |             |
+              v              |             |
+             +---+           |             |
+             |   |<----------+             |
+             |   |<------------------------+
+             |   |------+
+             +---+      |
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+                        ^
+                        |
+                    head page
+
+
+In this case, the head page will not move when the tail and commit
+move back into the ring buffer.
+
+The reader cannot swap a page into the ring buffer if the commit page
+is still on that page. If the reader meets the last commit (a real commit,
+not a pending or reserved one), then there is nothing more to read.
+The buffer is considered empty until another full commit finishes.
+
+When the tail meets the head page, if the buffer is in overwrite mode,
+the head page will be pushed ahead one. If the buffer is in producer/consumer
+mode, the write will fail.
+
+Overwrite mode:
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+                        ^
+                        |
+                    head page
+
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+                                 ^
+                                 |
+                             head page
+
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+                                 ^
+                                 |
+                             head page
+
+Note, the reader page will still point to the previous head page.
+But when a swap takes place, it will use the most recent head page.
+
+
+Making the Ring Buffer Lockless:
+--------------------------------
+
+The main idea behind the lockless algorithm is to combine the moving
+of the head_page pointer with the swapping of pages with the reader.
+State flags are placed inside the pointer to the page. To do this,
+each page must be aligned in memory by 4 bytes. This will allow the 2
+least significant bits of the address to be used as flags, since
+they will always be zero for the address. To get the address,
+simply mask out the flags.
+
+  MASK = ~3
+
+  address & MASK
+
+Two flags will be kept in these two bits:
+
+   HEADER - the page being pointed to is a head page
+
+   UPDATE - the page being pointed to is being updated by a writer
+          and was or is about to be a head page.
+
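+
+A minimal sketch of how these flags can be carried in the low bits of
+the list pointers (the macro and helper names here are illustrative,
+not necessarily those used by kernel/trace/ring_buffer.c):
+
+   #define RB_FLAG_MASK    3UL
+   #define RB_PAGE_HEAD    1UL   /* HEADER: next page is the head page  */
+   #define RB_PAGE_UPDATE  2UL   /* UPDATE: a writer is moving the head */
+
+   /* strip the flag bits to recover the real list pointer */
+   static struct list_head *rb_list_head(struct list_head *list)
+   {
+           unsigned long val = (unsigned long)list;
+
+           return (struct list_head *)(val & ~RB_FLAG_MASK);
+   }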
+
+          reader page
+              |
+              v
+             +---+
+             |   |------+
+             +---+      |
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-H->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+
+The above pointer "-H->" would have the HEADER flag set. That is,
+the next page is the next page to be swapped out by the reader.
+This pointer means the next page is the head page.
+
+When the tail page meets the head pointer, it will use cmpxchg to
+change the pointer to the UPDATE state:
+
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-H->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+"-U->" represents a pointer in the UPDATE state.
+
+Any access to the reader will need to take some sort of lock to serialize
+the readers. But the writers will never take a lock to write to the
+ring buffer. This means we only need to worry about a single reader,
+and writes only preempt in "stack" formation.
+
+When the reader tries to swap the page with the ring buffer, it
+will also use cmpxchg. If the flag bit in the pointer to the
+head page does not have the HEADER flag set, the compare will fail
+and the reader will need to look for the new head page and try again.
+Note, the flags UPDATE and HEADER are never set at the same time.
+
+The reader swaps the reader page as follows:
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |
+  +------+
+                  +---+    +---+    +---+
+                  |   |--->|   |--->|   |
+                  |   |<---|   |<---|   |
+                  +---+    +---+    +---+
+                   ^ |               ^ |
+                   | +---------------+ |
+                   +-----H-------------+
+
+The reader sets the reader page's next pointer, as HEADER, to the page
+after the head page.
+
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |-------H-----------+
+  +------+                   v
+    |             +---+    +---+    +---+
+    |             |   |--->|   |--->|   |
+    |             |   |<---|   |<---|   |<-+
+    |             +---+    +---+    +---+  |
+    |              ^ |               ^ |   |
+    |              | +---------------+ |   |
+    |              +-----H-------------+   |
+    +--------------------------------------+
+
+It does a cmpxchg with the pointer to the previous head page to make it
+point to the reader page. Note that the new pointer does not have the HEADER
+flag set.  This action atomically moves the head page forward.
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |-------H-----------+
+  +------+                   v
+    |  ^          +---+   +---+   +---+
+    |  |          |   |-->|   |-->|   |
+    |  |          |   |<--|   |<--|   |<-+
+    |  |          +---+   +---+   +---+  |
+    |  |             |             ^ |   |
+    |  |             +-------------+ |   |
+    |  +-----------------------------+   |
+    +------------------------------------+
+
+After the new head page is set, the previous pointer of the head page is
+updated to the reader page.
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |-------H-----------+
+  +------+ <---------------+ v
+    |  ^          +---+   +---+   +---+
+    |  |          |   |-->|   |-->|   |
+    |  |          |   |   |   |<--|   |<-+
+    |  |          +---+   +---+   +---+  |
+    |  |             |             ^ |   |
+    |  |             +-------------+ |   |
+    |  +-----------------------------+   |
+    +------------------------------------+
+
+  +------+
+  |buffer|          RING BUFFER
+  |page  |-------H-----------+  <--- New head page
+  +------+ <---------------+ v
+    |  ^          +---+   +---+   +---+
+    |  |          |   |   |   |-->|   |
+    |  |  New     |   |   |   |<--|   |<-+
+    |  | Reader   +---+   +---+   +---+  |
+    |  |  page ----^                 |   |
+    |  |                             |   |
+    |  +-----------------------------+   |
+    +------------------------------------+
+
+Another important point: the page that the reader page points back to
+by its previous pointer (the one that now points to the new head page)
+never points back to the reader page. That is because the reader page is
+not part of the ring buffer. Traversing the ring buffer via the next pointers
+will always stay in the ring buffer. Traversing the ring buffer via the
+prev pointers may not.
+
+Note, the way to determine a reader page is simply by examining the previous
+pointer of the page. If the next pointer of the previous page does not
+point back to the original page, then the original page is a reader page:
+
+
+             +--------+
+             | reader |  next   +----+
+             |  page  |-------->|    |<====== (buffer page)
+             +--------+         +----+
+                 |                | ^
+                 |                v | next
+            prev |              +----+
+                 +------------->|    |
+                                +----+
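+
+A sketch of that test, reusing the illustrative rb_list_head() helper
+from above (the in-kernel check may differ in detail):
+
+   /* non-zero if 'page' is the reader page: the page its prev pointer
+    * leads to does not link back to it through a next pointer */
+   static int page_is_reader_page(struct list_head *page)
+   {
+           struct list_head *prev = rb_list_head(page->prev);
+
+           return rb_list_head(prev->next) != page;
+   }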
+
+The way the head page moves forward:
+
+When the tail page meets the head page and the buffer is in overwrite mode
+and more writes take place, the head page must be moved forward before the
+writer may move the tail page. The way this is done is that the writer
+performs a cmpxchg to convert the pointer to the head page from the HEADER
+flag to have the UPDATE flag set. Once this is done, the reader will
+not be able to swap the head page from the buffer, nor will it be able to
+move the head page, until the writer is finished with the move.
+
+This eliminates any races that the reader can have on the writer. The reader
+must spin, and this is why the reader cannot preempt the writer.
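+
+The transition itself can be sketched with the illustrative macros from
+above ('head_link' stands for the next pointer of the page that precedes
+the head page, taken as an unsigned long so the flag bits are visible;
+this is not the kernel's actual code):
+
+   unsigned long old, new;
+
+   old = head_link;                                /* expected: HEADER set  */
+   new = (old & ~RB_FLAG_MASK) | RB_PAGE_UPDATE;   /* same page, UPDATE set */
+
+   if (cmpxchg(&head_link, old, new) != old)
+           goto again;   /* the link changed under us: re-read and retry */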
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-H->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The following page will be made into the new head page.
+
+           tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+After the new head page has been set, we can set the old head page
+pointer back to NORMAL.
+
+           tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+After the head page has been moved, the tail page may now move forward.
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+
+The above are the trivial updates. Now for the more complex scenarios.
+
+
+As stated before, if enough writes preempt the first write, the
+tail page may make it all the way around the buffer and meet the commit
+page. At this time, we must start dropping writes (usually with some kind
+of warning to the user). But what happens if the commit was still on the
+reader page? In that case the commit page (the reader page) is not part
+of the ring buffer, and the tail page must account for this.
+
+
+          reader page    commit page
+              |              |
+              v              |
+             +---+           |
+             |   |<----------+
+             |   |
+             |   |------+
+             +---+      |
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-H->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+               ^
+               |
+           tail page
+
+If the tail page were to simply push the head page forward, the commit when
+leaving the reader page would not be pointing to the correct page.
+
+The solution to this is to test if the commit page is on the reader page
+before pushing the head page. If it is, then it can be assumed that the
+tail page wrapped the buffer, and we must drop new writes.
+
+This is not a race condition, because the commit page can only be moved
+by the outermost writer (the writer that was preempted).
+This means that the commit will not move while a writer is moving the
+tail page. The reader cannot swap the reader page if it is also being
+used as the commit page. The reader can simply check that the commit
+is off the reader page. Once the commit page leaves the reader page
+it will never go back on it unless a reader does another swap with the
+buffer page that is also the commit page.
+
+
+Nested writes
+-------------
+
+When pushing the tail page forward, we must first push forward
+the head page if the head page is the next page. If the head page
+is not the next page, the tail page is simply updated with a cmpxchg.
+
+Only writers move the tail page. This must be done atomically to protect
+against nested writers.
+
+  temp_page = tail_page
+  next_page = temp_page->next
+  cmpxchg(tail_page, temp_page, next_page)
+
+The above will update the tail page if it is still pointing to the expected
+page. If this fails, a nested write pushed it forward, and the current write
+does not need to push it.
+
+
+           temp page
+               |
+               v
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+Nested write comes in and moves the tail page forward:
+
+                    tail page (moved by nested writer)
+            temp page   |
+               |        |
+               v        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The above would fail the cmpxchg, but since the tail page has already
+been moved forward, the writer will just try again to reserve storage
+on the new tail page.
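+
+Expressed in C-like form, a sketch of that attempt (illustrative only;
+the actual helper in kernel/trace/ring_buffer.c is more involved because
+it also has to deal with the flag bits, and rb_next_page() is a made-up
+"follow the next pointer" helper):
+
+   struct buffer_page *temp_page, *next_page;
+
+   temp_page = cpu_buffer->tail_page;
+   next_page = rb_next_page(temp_page);
+
+   if (cmpxchg(&cpu_buffer->tail_page, temp_page, next_page) != temp_page) {
+           /* a nested writer already pushed the tail page forward; do
+            * not push it again, just reserve on the new tail page */
+   }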
+
+But the moving of the head page is a bit more complex.
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-H->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The writer converts the head page pointer to UPDATE.
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+But if a nested writer preempts here, it will see that the next
+page is a head page, but it is also nested. It will detect that
+it is nested and will save that information. The detection is the
+fact that it sees the UPDATE flag instead of a HEADER or NORMAL
+pointer.
+
+The nested writer will set the new head page pointer.
+
+           tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+But it will not reset the update back to normal. Only the writer
+that converted a pointer from HEAD to UPDATE will convert it back
+to NORMAL.
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+After the nested writer finishes, the outermost writer will convert
+the UPDATE pointer to NORMAL.
+
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+
+It can be even more complex if several nested writes came in and moved
+the tail page ahead several pages:
+
+
+(first writer)
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-H->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The writer converts the head page pointer to UPDATE.
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The next writer comes in, sees the update, and sets up the new
+head page.
+
+(second writer)
+
+           tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The nested writer moves the tail page forward, but does not set the old
+update page to NORMAL because it is not the outermost writer.
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+Another writer preempts and sees that the page after the tail page is a
+head page.
+It changes it from HEAD to UPDATE.
+
+(third writer)
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-U->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The writer will move the head page forward:
+
+
+(third writer)
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-U->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+But since it was the third writer that changed the HEAD flag to UPDATE,
+it will convert it back to NORMAL:
+
+
+(third writer)
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+
+Then it will move the tail page, and return to the second writer.
+
+
+(second writer)
+
+                             tail page
+                                 |
+                                 v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+
+The second writer will fail to move the tail page because it was already
+moved, so it will try again and add its data to the new tail page.
+It will return to the first writer.
+
+
+(first writer)
+
+                             tail page
+                                 |
+                                 v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The first writer cannot atomically test whether the tail page moved
+while it updates the HEAD page. It will then update the head page to
+what it thinks is the new head page.
+
+
+(first writer)
+
+                             tail page
+                                 |
+                                 v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+Since the cmpxchg returns the old value of the pointer, the first writer
+will see that it succeeded in updating the pointer from NORMAL to HEAD.
+But as we can see, this is not good enough. It must also check to see
+if the tail page is either where it used to be or on the next page:
+
+
+(first writer)
+
+               A        B    tail page
+               |        |        |
+               v        v        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+If the tail page is neither A nor B, then the writer must reset the
+pointer back to NORMAL. Because it only needs to worry about nested
+writers, it only needs to check this after setting the HEAD page.
+
+
+(first writer)
+
+               A        B    tail page
+               |        |        |
+               v        v        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+Now the writer can update the head page. This is also why the head page must
+remain in UPDATE and only be reset by the outermost writer. This prevents
+the reader from seeing the incorrect head page.
+
+
+(first writer)
+
+               A        B    tail page
+               |        |        |
+               v        v        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
index d94e1ea3b9fe03557f99494fd4ba123143103233..8e9663413b7f1c863535e6538f0612ca4b71bb55 100644 (file)
@@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
        unsigned long return_hooker = (unsigned long)
                                &return_to_handler;
 
-       /* Nmi's are currently unsupported */
-       if (unlikely(in_nmi()))
-               return;
-
        if (unlikely(atomic_read(&current->tracing_graph_pause)))
                return;
 
index a81170de7f6bf4907faf644495831aad0c8fc30a..ac8c6f8cf242e39bd6e489221f91014dbfe10d55 100644 (file)
@@ -103,6 +103,8 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
 
 void tracing_record_cmdline(struct task_struct *tsk);
 
+struct event_filter;
+
 struct ftrace_event_call {
        struct list_head        list;
        char                    *name;
@@ -118,7 +120,7 @@ struct ftrace_event_call {
        int                     (*define_fields)(void);
        struct list_head        fields;
        int                     filter_active;
-       void                    *filter;
+       struct event_filter     *filter;
        void                    *mod;
 
        atomic_t                profile_count;
index 29f8599e6bea0db5ef5e963511925138a06b2ad1..7fca71693ae72356f08a91b490b389bb9726d84f 100644 (file)
@@ -170,7 +170,6 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
 unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
 unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu);
-unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu);
 
 u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
 void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
index f64fbaae781aa0e8fa78cf5848b2bb81add4721b..25d3b02a06f8a52f3a624d8f9a99c89ef883984e 100644 (file)
@@ -25,7 +25,7 @@
 #define __array(type, item, len)       type    item[len];
 
 #undef __dynamic_array
-#define __dynamic_array(type, item, len) unsigned short __data_loc_##item;
+#define __dynamic_array(type, item, len) u32 __data_loc_##item;
 
 #undef __string
 #define __string(item, src) __dynamic_array(char, item, -1)
  * Include the following:
  *
  * struct ftrace_data_offsets_<call> {
- *     int                             <item1>;
- *     int                             <item2>;
+ *     u32                             <item1>;
+ *     u32                             <item2>;
  *     [...]
  * };
  *
- * The __dynamic_array() macro will create each int <item>, this is
+ * The __dynamic_array() macro will create each u32 <item>, this is
  * to keep the offset of each array from the beginning of the event.
+ * The size of an array is also encoded, in the higher 16 bits of <item>.
  */
 
 #undef __field
@@ -67,7 +68,7 @@
 #define __array(type, item, len)
 
 #undef __dynamic_array
-#define __dynamic_array(type, item, len)       int item;
+#define __dynamic_array(type, item, len)       u32 item;
 
 #undef __string
 #define __string(item, src) __dynamic_array(char, item, -1)
 
 #undef __dynamic_array
 #define __dynamic_array(type, item, len)                                      \
-       ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t"            \
+       ret = trace_seq_printf(s, "\tfield:__data_loc " #type "[] " #item ";\t"\
                               "offset:%u;\tsize:%u;\n",                       \
                               (unsigned int)offsetof(typeof(field),           \
                                        __data_loc_##item),                    \
@@ -210,7 +211,7 @@ ftrace_format_##call(struct trace_seq *s)                           \
 
 #undef __get_dynamic_array
 #define __get_dynamic_array(field)     \
-               ((void *)__entry + __entry->__data_loc_##field)
+               ((void *)__entry + (__entry->__data_loc_##field & 0xffff))
 
 #undef __get_str
 #define __get_str(field) (char *)__get_dynamic_array(field)
@@ -282,7 +283,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)    \
 
 #undef __dynamic_array
 #define __dynamic_array(type, item, len)                                      \
-       ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\
+       ret = trace_define_field(event_call, "__data_loc " #type "[]", #item,  \
                                offsetof(typeof(field), __data_loc_##item),    \
                                 sizeof(field.__data_loc_##item), 0);
 
@@ -328,6 +329,7 @@ ftrace_define_fields_##call(void)                                   \
 #define __dynamic_array(type, item, len)                               \
        __data_offsets->item = __data_size +                            \
                               offsetof(typeof(*entry), __data);        \
+       __data_offsets->item |= (len * sizeof(type)) << 16;             \
        __data_size += (len) * sizeof(type);
 
 #undef __string
index 0540948e29abf0217fc7e4edded97874aa26605c..ef177d653b2cb791b11f98723d3afe5ada5ac586 100644 (file)
@@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
 #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
 
 struct kprobe_insn_page {
-       struct hlist_node hlist;
+       struct list_head list;
        kprobe_opcode_t *insns;         /* Page of instruction slots */
        char slot_used[INSNS_PER_PAGE];
        int nused;
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
 };
 
 static DEFINE_MUTEX(kprobe_insn_mutex);        /* Protects kprobe_insn_pages */
-static struct hlist_head kprobe_insn_pages;
+static LIST_HEAD(kprobe_insn_pages);
 static int kprobe_garbage_slots;
 static int collect_garbage_slots(void);
 
@@ -152,10 +152,9 @@ loop_end:
 static kprobe_opcode_t __kprobes *__get_insn_slot(void)
 {
        struct kprobe_insn_page *kip;
-       struct hlist_node *pos;
 
  retry:
-       hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
+       list_for_each_entry(kip, &kprobe_insn_pages, list) {
                if (kip->nused < INSNS_PER_PAGE) {
                        int i;
                        for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
                kfree(kip);
                return NULL;
        }
-       INIT_HLIST_NODE(&kip->hlist);
-       hlist_add_head(&kip->hlist, &kprobe_insn_pages);
+       INIT_LIST_HEAD(&kip->list);
+       list_add(&kip->list, &kprobe_insn_pages);
        memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
        kip->slot_used[0] = SLOT_USED;
        kip->nused = 1;
@@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
                 * so as not to have to set it up again the
                 * next time somebody inserts a probe.
                 */
-               hlist_del(&kip->hlist);
-               if (hlist_empty(&kprobe_insn_pages)) {
-                       INIT_HLIST_NODE(&kip->hlist);
-                       hlist_add_head(&kip->hlist,
-                                      &kprobe_insn_pages);
-               } else {
+               if (!list_is_singular(&kprobe_insn_pages)) {
+                       list_del(&kip->list);
                        module_free(NULL, kip->insns);
                        kfree(kip);
                }
@@ -235,14 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
 
 static int __kprobes collect_garbage_slots(void)
 {
-       struct kprobe_insn_page *kip;
-       struct hlist_node *pos, *next;
+       struct kprobe_insn_page *kip, *next;
 
        /* Ensure no-one is preepmted on the garbages */
        if (check_safety())
                return -EAGAIN;
 
-       hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
+       list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
                int i;
                if (kip->ngarbage == 0)
                        continue;
@@ -260,19 +254,17 @@ static int __kprobes collect_garbage_slots(void)
 void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
 {
        struct kprobe_insn_page *kip;
-       struct hlist_node *pos;
 
        mutex_lock(&kprobe_insn_mutex);
-       hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
+       list_for_each_entry(kip, &kprobe_insn_pages, list) {
                if (kip->insns <= slot &&
                    slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
                        int i = (slot - kip->insns) / MAX_INSN_SIZE;
                        if (dirty) {
                                kip->slot_used[i] = SLOT_DIRTY;
                                kip->ngarbage++;
-                       } else {
+                       } else
                                collect_one_slot(kip, i);
-                       }
                        break;
                }
        }
index 1e1d23c263086635ec225dd762aa47d880002b35..094863416b2ecfbafec1e4f8f5feb8b65c0dfbcf 100644 (file)
@@ -1016,71 +1016,35 @@ static int
 __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 {
        unsigned long ftrace_addr;
-       unsigned long ip, fl;
+       unsigned long flag = 0UL;
 
        ftrace_addr = (unsigned long)FTRACE_ADDR;
 
-       ip = rec->ip;
-
        /*
-        * If this record is not to be traced and
-        * it is not enabled then do nothing.
+        * If this record is not to be traced or we want to disable it,
+        * then disable it.
         *
-        * If this record is not to be traced and
-        * it is enabled then disable it.
+        * If we want to enable it and filtering is off, then enable it.
         *
+        * If we want to enable it and filtering is on, enable it only if
+        * it's filtered
         */
-       if (rec->flags & FTRACE_FL_NOTRACE) {
-               if (rec->flags & FTRACE_FL_ENABLED)
-                       rec->flags &= ~FTRACE_FL_ENABLED;
-               else
-                       return 0;
-
-       } else if (ftrace_filtered && enable) {
-               /*
-                * Filtering is on:
-                */
-
-               fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
-
-               /* Record is filtered and enabled, do nothing */
-               if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
-                       return 0;
-
-               /* Record is not filtered or enabled, do nothing */
-               if (!fl)
-                       return 0;
-
-               /* Record is not filtered but enabled, disable it */
-               if (fl == FTRACE_FL_ENABLED)
-                       rec->flags &= ~FTRACE_FL_ENABLED;
-               else
-               /* Otherwise record is filtered but not enabled, enable it */
-                       rec->flags |= FTRACE_FL_ENABLED;
-       } else {
-               /* Disable or not filtered */
-
-               if (enable) {
-                       /* if record is enabled, do nothing */
-                       if (rec->flags & FTRACE_FL_ENABLED)
-                               return 0;
-
-                       rec->flags |= FTRACE_FL_ENABLED;
-
-               } else {
+       if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
+               if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
+                       flag = FTRACE_FL_ENABLED;
+       }
 
-                       /* if record is not enabled, do nothing */
-                       if (!(rec->flags & FTRACE_FL_ENABLED))
-                               return 0;
+       /* If the state of this record hasn't changed, then do nothing */
+       if ((rec->flags & FTRACE_FL_ENABLED) == flag)
+               return 0;
 
-                       rec->flags &= ~FTRACE_FL_ENABLED;
-               }
+       if (flag) {
+               rec->flags |= FTRACE_FL_ENABLED;
+               return ftrace_make_call(rec, ftrace_addr);
        }
 
-       if (rec->flags & FTRACE_FL_ENABLED)
-               return ftrace_make_call(rec, ftrace_addr);
-       else
-               return ftrace_make_nop(NULL, rec, ftrace_addr);
+       rec->flags &= ~FTRACE_FL_ENABLED;
+       return ftrace_make_nop(NULL, rec, ftrace_addr);
 }
 
 static void ftrace_replace_code(int enable)
@@ -1375,7 +1339,6 @@ struct ftrace_iterator {
        unsigned                flags;
        unsigned char           buffer[FTRACE_BUFF_MAX+1];
        unsigned                buffer_idx;
-       unsigned                filtered;
 };
 
 static void *
@@ -1438,18 +1401,13 @@ static int t_hash_show(struct seq_file *m, void *v)
 {
        struct ftrace_func_probe *rec;
        struct hlist_node *hnd = v;
-       char str[KSYM_SYMBOL_LEN];
 
        rec = hlist_entry(hnd, struct ftrace_func_probe, node);
 
        if (rec->ops->print)
                return rec->ops->print(m, rec->ip, rec->ops, rec->data);
 
-       kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-       seq_printf(m, "%s:", str);
-
-       kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
-       seq_printf(m, "%s", str);
+       seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func);
 
        if (rec->data)
                seq_printf(m, ":%p", rec->data);
@@ -1547,7 +1505,6 @@ static int t_show(struct seq_file *m, void *v)
 {
        struct ftrace_iterator *iter = m->private;
        struct dyn_ftrace *rec = v;
-       char str[KSYM_SYMBOL_LEN];
 
        if (iter->flags & FTRACE_ITER_HASH)
                return t_hash_show(m, v);
@@ -1560,9 +1517,7 @@ static int t_show(struct seq_file *m, void *v)
        if (!rec)
                return 0;
 
-       kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-
-       seq_printf(m, "%s\n", str);
+       seq_printf(m, "%pf\n", (void *)rec->ip);
 
        return 0;
 }
@@ -2312,7 +2267,6 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
        }
 
        if (isspace(ch)) {
-               iter->filtered++;
                iter->buffer[iter->buffer_idx] = 0;
                ret = ftrace_process_regex(iter->buffer,
                                           iter->buffer_idx, enable);
@@ -2443,7 +2397,6 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
                iter = file->private_data;
 
        if (iter->buffer_idx) {
-               iter->filtered++;
                iter->buffer[iter->buffer_idx] = 0;
                ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
        }
@@ -2543,7 +2496,6 @@ static void g_stop(struct seq_file *m, void *p)
 static int g_show(struct seq_file *m, void *v)
 {
        unsigned long *ptr = v;
-       char str[KSYM_SYMBOL_LEN];
 
        if (!ptr)
                return 0;
@@ -2553,9 +2505,7 @@ static int g_show(struct seq_file *m, void *v)
                return 0;
        }
 
-       kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
-
-       seq_printf(m, "%s\n", str);
+       seq_printf(m, "%pf\n", v);
 
        return 0;
 }
index 1edaa9516e8153b8210398143c37dab20bd28777..dda53ccf749b979e3959a698eb67fd386104399f 100644 (file)
@@ -239,12 +239,52 @@ struct kmemtrace_user_event_alloc {
 };
 
 static enum print_line_t
-kmemtrace_print_alloc_user(struct trace_iterator *iter,
-                          struct kmemtrace_alloc_entry *entry)
+kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
 {
-       struct kmemtrace_user_event_alloc *ev_alloc;
        struct trace_seq *s = &iter->seq;
+       struct kmemtrace_alloc_entry *entry;
+       int ret;
+
+       trace_assign_type(entry, iter->ent);
+
+       ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
+           "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
+           entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
+           (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
+           (unsigned long)entry->gfp_flags, entry->node);
+
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_free(struct trace_iterator *iter, int flags)
+{
+       struct trace_seq *s = &iter->seq;
+       struct kmemtrace_free_entry *entry;
+       int ret;
+
+       trace_assign_type(entry, iter->ent);
+
+       ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
+                              entry->type_id, (void *)entry->call_site,
+                              (unsigned long)entry->ptr);
+
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
+{
+       struct trace_seq *s = &iter->seq;
+       struct kmemtrace_alloc_entry *entry;
        struct kmemtrace_user_event *ev;
+       struct kmemtrace_user_event_alloc *ev_alloc;
+
+       trace_assign_type(entry, iter->ent);
 
        ev = trace_seq_reserve(s, sizeof(*ev));
        if (!ev)
@@ -271,12 +311,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter,
 }
 
 static enum print_line_t
-kmemtrace_print_free_user(struct trace_iterator *iter,
-                         struct kmemtrace_free_entry *entry)
+kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
 {
        struct trace_seq *s = &iter->seq;
+       struct kmemtrace_free_entry *entry;
        struct kmemtrace_user_event *ev;
 
+       trace_assign_type(entry, iter->ent);
+
        ev = trace_seq_reserve(s, sizeof(*ev));
        if (!ev)
                return TRACE_TYPE_PARTIAL_LINE;
@@ -294,12 +336,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter,
 
 /* The two other following provide a more minimalistic output */
 static enum print_line_t
-kmemtrace_print_alloc_compress(struct trace_iterator *iter,
-                                       struct kmemtrace_alloc_entry *entry)
+kmemtrace_print_alloc_compress(struct trace_iterator *iter)
 {
+       struct kmemtrace_alloc_entry *entry;
        struct trace_seq *s = &iter->seq;
        int ret;
 
+       trace_assign_type(entry, iter->ent);
+
        /* Alloc entry */
        ret = trace_seq_printf(s, "  +      ");
        if (!ret)
@@ -345,29 +389,24 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
 
-       /* Node */
-       ret = trace_seq_printf(s, "%4d   ", entry->node);
-       if (!ret)
-               return TRACE_TYPE_PARTIAL_LINE;
-
-       /* Call site */
-       ret = seq_print_ip_sym(s, entry->call_site, 0);
+       /* Node and call site*/
+       ret = trace_seq_printf(s, "%4d   %pf\n", entry->node,
+                                                (void *)entry->call_site);
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
 
-       if (!trace_seq_printf(s, "\n"))
-               return TRACE_TYPE_PARTIAL_LINE;
-
        return TRACE_TYPE_HANDLED;
 }
 
 static enum print_line_t
-kmemtrace_print_free_compress(struct trace_iterator *iter,
-                             struct kmemtrace_free_entry *entry)
+kmemtrace_print_free_compress(struct trace_iterator *iter)
 {
+       struct kmemtrace_free_entry *entry;
        struct trace_seq *s = &iter->seq;
        int ret;
 
+       trace_assign_type(entry, iter->ent);
+
        /* Free entry */
        ret = trace_seq_printf(s, "  -      ");
        if (!ret)
@@ -401,19 +440,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter,
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
 
-       /* Skip node */
-       ret = trace_seq_printf(s, "       ");
+       /* Skip node and print call site*/
+       ret = trace_seq_printf(s, "       %pf\n", (void *)entry->call_site);
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
 
-       /* Call site */
-       ret = seq_print_ip_sym(s, entry->call_site, 0);
-       if (!ret)
-               return TRACE_TYPE_PARTIAL_LINE;
-
-       if (!trace_seq_printf(s, "\n"))
-               return TRACE_TYPE_PARTIAL_LINE;
-
        return TRACE_TYPE_HANDLED;
 }
 
@@ -421,32 +452,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 {
        struct trace_entry *entry = iter->ent;
 
-       switch (entry->type) {
-       case TRACE_KMEM_ALLOC: {
-               struct kmemtrace_alloc_entry *field;
-
-               trace_assign_type(field, entry);
-               if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
-                       return kmemtrace_print_alloc_compress(iter, field);
-               else
-                       return kmemtrace_print_alloc_user(iter, field);
-       }
-
-       case TRACE_KMEM_FREE: {
-               struct kmemtrace_free_entry *field;
-
-               trace_assign_type(field, entry);
-               if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
-                       return kmemtrace_print_free_compress(iter, field);
-               else
-                       return kmemtrace_print_free_user(iter, field);
-       }
+       if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
+               return TRACE_TYPE_UNHANDLED;
 
+       switch (entry->type) {
+       case TRACE_KMEM_ALLOC:
+               return kmemtrace_print_alloc_compress(iter);
+       case TRACE_KMEM_FREE:
+               return kmemtrace_print_free_compress(iter);
        default:
                return TRACE_TYPE_UNHANDLED;
        }
 }
 
+static struct trace_event kmem_trace_alloc = {
+       .type                   = TRACE_KMEM_ALLOC,
+       .trace                  = kmemtrace_print_alloc,
+       .binary                 = kmemtrace_print_alloc_user,
+};
+
+static struct trace_event kmem_trace_free = {
+       .type                   = TRACE_KMEM_FREE,
+       .trace                  = kmemtrace_print_free,
+       .binary                 = kmemtrace_print_free_user,
+};
+
 static struct tracer kmem_tracer __read_mostly = {
        .name                   = "kmemtrace",
        .init                   = kmem_trace_init,
@@ -463,6 +493,21 @@ void kmemtrace_init(void)
 
 static int __init init_kmem_tracer(void)
 {
-       return register_tracer(&kmem_tracer);
+       if (!register_ftrace_event(&kmem_trace_alloc)) {
+               pr_warning("Warning: could not register kmem events\n");
+               return 1;
+       }
+
+       if (!register_ftrace_event(&kmem_trace_free)) {
+               pr_warning("Warning: could not register kmem events\n");
+               return 1;
+       }
+
+       if (!register_tracer(&kmem_tracer)) {
+               pr_warning("Warning: could not register the kmem tracer\n");
+               return 1;
+       }
+
+       return 0;
 }
 device_initcall(init_kmem_tracer);
index a330513d96ce321ae0ea50e9fa648d6e9cbfc7e5..da2c59d8f486c11c6e3db8759c15e78805a666f0 100644 (file)
@@ -322,6 +322,14 @@ struct buffer_data_page {
        unsigned char    data[];        /* data of buffer page */
 };
 
+/*
+ * Note, the buffer_page list must be first. The buffer pages
+ * are allocated in cache lines, which means that each buffer
+ * page will be at the beginning of a cache line, and thus
+ * the least significant bits will be zero. We use this to
+ * add flags in the list struct pointers, to make the ring buffer
+ * lockless.
+ */
 struct buffer_page {
        struct list_head list;          /* list of buffer pages */
        local_t          write;         /* index for next write */
@@ -330,6 +338,21 @@ struct buffer_page {
        struct buffer_data_page *page;  /* Actual data page */
 };
 
+/*
+ * The buffer page counters, write and entries, must be reset
+ * atomically when crossing page boundaries. To synchronize this
+ * update, two counters are inserted into the number. One is
+ * the actual counter for the write position or count on the page.
+ *
+ * The other is a counter of updaters. Before an update happens
+ * the update partition of the counter is incremented. This will
+ * allow the updater to update the counter atomically.
+ *
+ * The counter is 20 bits, and the state data is 12.
+ */
+#define RB_WRITE_MASK          0xfffff
+#define RB_WRITE_INTCNT                (1 << 20)
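As a side note (not part of the patch itself): the split described above can be modeled in plain user-space C. The names below are hypothetical and local_add_return() is replaced by ordinary arithmetic; the point is only to show how one word carries both the 20-bit write index and the updater count:

	#include <stdio.h>

	#define WRITE_MASK   0xfffffUL       /* low 20 bits: write index   */
	#define WRITE_INTCNT (1UL << 20)     /* everything above: updaters */

	int main(void)
	{
		unsigned long write = 5;             /* index 5, no updaters yet    */

		write += WRITE_INTCNT;               /* an updater announces itself */
		printf("index=%lu updaters=%lu\n",
		       write & WRITE_MASK, write >> 20);   /* index=5 updaters=1 */
		return 0;
	}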
+
 static void rb_init_page(struct buffer_data_page *bpage)
 {
        local_set(&bpage->commit, 0);
@@ -403,21 +426,20 @@ int ring_buffer_print_page_header(struct trace_seq *s)
 struct ring_buffer_per_cpu {
        int                             cpu;
        struct ring_buffer              *buffer;
-       spinlock_t                      reader_lock; /* serialize readers */
+       spinlock_t                      reader_lock;    /* serialize readers */
        raw_spinlock_t                  lock;
        struct lock_class_key           lock_key;
-       struct list_head                pages;
+       struct list_head                *pages;
        struct buffer_page              *head_page;     /* read from head */
        struct buffer_page              *tail_page;     /* write to tail */
        struct buffer_page              *commit_page;   /* committed pages */
        struct buffer_page              *reader_page;
-       unsigned long                   nmi_dropped;
-       unsigned long                   commit_overrun;
-       unsigned long                   overrun;
-       unsigned long                   read;
+       local_t                         commit_overrun;
+       local_t                         overrun;
        local_t                         entries;
        local_t                         committing;
        local_t                         commits;
+       unsigned long                   read;
        u64                             write_stamp;
        u64                             read_stamp;
        atomic_t                        record_disabled;
@@ -489,6 +511,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
 
+/*
+ * Making the ring buffer lockless makes things tricky.
+ * Writes only happen on the CPU that they are on, so they
+ * only need to worry about interrupts. Reads, however, can
+ * happen on any CPU.
+ *
+ * The reader page is always off the ring buffer, but when the
+ * reader finishes with a page, it needs to swap its page with
+ * a new one from the buffer. The reader needs to take from
+ * the head (writes go to the tail). But if a writer is in overwrite
+ * mode and wraps, it must push the head page forward.
+ *
+ * Here lies the problem.
+ *
+ * The reader must be careful to replace only the head page, and
+ * not another one. As described at the top of the file in the
+ * ASCII art, the reader sets its old page to point to the next
+ * page after head. It then sets the page after head to point to
+ * the old reader page. But if the writer moves the head page
+ * during this operation, the reader could end up with the tail.
+ *
+ * We use cmpxchg to help prevent this race. We also do something
+ * special with the page before head. We set the LSB to 1.
+ *
+ * When the writer must push the page forward, it will clear the
+ * bit that points to the head page, move the head, and then set
+ * the bit that points to the new head page.
+ *
+ * We also don't want an interrupt coming in and moving the head
+ * page on another writer, so we use the second LSB to catch
+ * that too. Thus:
+ *
+ * head->list->prev->next        bit 1          bit 0
+ *                              -------        -------
+ * Normal page                     0              0
+ * Points to head page             0              1
+ * New head page                   1              0
+ *
+ * Note we can not trust the prev pointer of the head page, because:
+ *
+ * +----+       +-----+        +-----+
+ * |    |------>|  T  |---X--->|  N  |
+ * |    |<------|     |        |     |
+ * +----+       +-----+        +-----+
+ *   ^                           ^ |
+ *   |          +-----+          | |
+ *   +----------|  R  |----------+ |
+ *              |     |<-----------+
+ *              +-----+
+ *
+ * Key:  ---X-->  HEAD flag set in pointer
+ *         T      Tail page
+ *         R      Reader page
+ *         N      Next page
+ *
+ * (see __rb_reserve_next() to see where this happens)
+ *
+ *  What the above shows is that the reader just swapped out
+ *  the reader page with a page in the buffer, but before it
+ *  could make the new header point back to the new page added
+ *  it was preempted by a writer. The writer moved forward onto
+ *  the new page added by the reader and is about to move forward
+ *  again.
+ *
+ *  As you can see, it is legitimate for the previous pointer of
+ *  the head (or any page) not to point back to itself, but only
+ *  temporarily.
+ */
+
+#define RB_PAGE_NORMAL         0UL
+#define RB_PAGE_HEAD           1UL
+#define RB_PAGE_UPDATE         2UL
+
+
+#define RB_FLAG_MASK           3UL
+
+/* PAGE_MOVED is not part of the mask */
+#define RB_PAGE_MOVED          4UL
+
+/*
+ * rb_list_head - remove any bit
+ */
+static struct list_head *rb_list_head(struct list_head *list)
+{
+       unsigned long val = (unsigned long)list;
+
+       return (struct list_head *)(val & ~RB_FLAG_MASK);
+}
+
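A stand-alone sketch of the pointer-tagging idea described in the comment above (a user-space model, assuming only that the allocation is aligned so the two least significant bits of its address are free; it mirrors what rb_list_head() undoes):

	#include <stdio.h>
	#include <stdlib.h>

	#define FLAG_MASK  3UL
	#define PAGE_HEAD  1UL

	int main(void)
	{
		/* aligned_alloc() guarantees the low bits of the address are zero */
		void *page = aligned_alloc(64, 64);
		unsigned long tagged = (unsigned long)page | PAGE_HEAD;  /* mark as head */

		printf("flags=%lu clean=%p\n",
		       tagged & FLAG_MASK,                 /* 1: points to the head page */
		       (void *)(tagged & ~FLAG_MASK));     /* the untagged pointer back  */
		free(page);
		return 0;
	}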
+/*
+ * rb_is_head_page - test if the given page is the head page
+ *
+ * Because the reader may move the head_page pointer, we can
+ * not trust what the head page is (it may be pointing to
+ * the reader page). But if the next page is a header page,
+ * its flags will be non zero.
+ */
+static inline int
+rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
+               struct buffer_page *page, struct list_head *list)
+{
+       unsigned long val;
+
+       val = (unsigned long)list->next;
+
+       if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
+               return RB_PAGE_MOVED;
+
+       return val & RB_FLAG_MASK;
+}
+
+/*
+ * rb_is_reader_page
+ *
+ * The unique thing about the reader page is that, if the
+ * writer is ever on it, the previous pointer never points
+ * back to the reader page.
+ */
+static int rb_is_reader_page(struct buffer_page *page)
+{
+       struct list_head *list = page->list.prev;
+
+       return rb_list_head(list->next) != &page->list;
+}
+
+/*
+ * rb_set_list_to_head - set a list_head to be pointing to head.
+ */
+static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
+                               struct list_head *list)
+{
+       unsigned long *ptr;
+
+       ptr = (unsigned long *)&list->next;
+       *ptr |= RB_PAGE_HEAD;
+       *ptr &= ~RB_PAGE_UPDATE;
+}
+
+/*
+ * rb_head_page_activate - sets up head page
+ */
+static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
+{
+       struct buffer_page *head;
+
+       head = cpu_buffer->head_page;
+       if (!head)
+               return;
+
+       /*
+        * Set the previous list pointer to have the HEAD flag.
+        */
+       rb_set_list_to_head(cpu_buffer, head->list.prev);
+}
+
+static void rb_list_head_clear(struct list_head *list)
+{
+       unsigned long *ptr = (unsigned long *)&list->next;
+
+       *ptr &= ~RB_FLAG_MASK;
+}
+
+/*
+ * rb_head_page_deactivate - clears head page ptr (for free list)
+ */
+static void
+rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
+{
+       struct list_head *hd;
+
+       /* Go through the whole list and clear any pointers found. */
+       rb_list_head_clear(cpu_buffer->pages);
+
+       list_for_each(hd, cpu_buffer->pages)
+               rb_list_head_clear(hd);
+}
+
+static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
+                           struct buffer_page *head,
+                           struct buffer_page *prev,
+                           int old_flag, int new_flag)
+{
+       struct list_head *list;
+       unsigned long val = (unsigned long)&head->list;
+       unsigned long ret;
+
+       list = &prev->list;
+
+       val &= ~RB_FLAG_MASK;
+
+       ret = (unsigned long)cmpxchg(&list->next,
+                                    val | old_flag, val | new_flag);
+
+       /* check if the reader took the page */
+       if ((ret & ~RB_FLAG_MASK) != val)
+               return RB_PAGE_MOVED;
+
+       return ret & RB_FLAG_MASK;
+}
+
+static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
+                                  struct buffer_page *head,
+                                  struct buffer_page *prev,
+                                  int old_flag)
+{
+       return rb_head_page_set(cpu_buffer, head, prev,
+                               old_flag, RB_PAGE_UPDATE);
+}
+
+static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
+                                struct buffer_page *head,
+                                struct buffer_page *prev,
+                                int old_flag)
+{
+       return rb_head_page_set(cpu_buffer, head, prev,
+                               old_flag, RB_PAGE_HEAD);
+}
+
+static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
+                                  struct buffer_page *head,
+                                  struct buffer_page *prev,
+                                  int old_flag)
+{
+       return rb_head_page_set(cpu_buffer, head, prev,
+                               old_flag, RB_PAGE_NORMAL);
+}
+
+static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+                              struct buffer_page **bpage)
+{
+       struct list_head *p = rb_list_head((*bpage)->list.next);
+
+       *bpage = list_entry(p, struct buffer_page, list);
+}
+
+static struct buffer_page *
+rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+       struct buffer_page *head;
+       struct buffer_page *page;
+       struct list_head *list;
+       int i;
+
+       if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
+               return NULL;
+
+       /* sanity check */
+       list = cpu_buffer->pages;
+       if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
+               return NULL;
+
+       page = head = cpu_buffer->head_page;
+       /*
+        * It is possible that the writer moves the header behind
+        * where we started, and we miss it in one loop.
+        * A second loop should grab the header, but we'll do
+        * three loops just because I'm paranoid.
+        */
+       for (i = 0; i < 3; i++) {
+               do {
+                       if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
+                               cpu_buffer->head_page = page;
+                               return page;
+                       }
+                       rb_inc_page(cpu_buffer, &page);
+               } while (page != head);
+       }
+
+       RB_WARN_ON(cpu_buffer, 1);
+
+       return NULL;
+}
+
+static int rb_head_page_replace(struct buffer_page *old,
+                               struct buffer_page *new)
+{
+       unsigned long *ptr = (unsigned long *)&old->list.prev->next;
+       unsigned long val;
+       unsigned long ret;
+
+       val = *ptr & ~RB_FLAG_MASK;
+       val |= RB_PAGE_HEAD;
+
+       ret = cmpxchg(ptr, val, (unsigned long)&new->list);
+
+       return ret == val;
+}
+
+/*
+ * rb_tail_page_update - move the tail page forward
+ *
+ * Returns 1 if moved tail page, 0 if someone else did.
+ */
+static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
+                              struct buffer_page *tail_page,
+                              struct buffer_page *next_page)
+{
+       struct buffer_page *old_tail;
+       unsigned long old_entries;
+       unsigned long old_write;
+       int ret = 0;
+
+       /*
+        * The tail page now needs to be moved forward.
+        *
+        * We need to reset the tail page, but without messing
+        * with possible erasing of data brought in by interrupts
+        * that have moved the tail page and are currently on it.
+        *
+        * We add a counter to the write field to denote this.
+        */
+       old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
+       old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
+
+       /*
+        * Just make sure we have seen our old_write and synchronize
+        * with any interrupts that come in.
+        */
+       barrier();
+
+       /*
+        * If the tail page is still the same as what we think
+        * it is, then it is up to us to update the tail
+        * pointer.
+        */
+       if (tail_page == cpu_buffer->tail_page) {
+               /* Zero the write counter */
+               unsigned long val = old_write & ~RB_WRITE_MASK;
+               unsigned long eval = old_entries & ~RB_WRITE_MASK;
+
+               /*
+                * This will only succeed if an interrupt did
+                * not come in and change it, in which case we
+                * do not want to modify it.
+                *
+                * We add (void) to let the compiler know that we do not care
+                * about the return value of these functions. We use the
+                * cmpxchg to only update if an interrupt did not already
+                * do it for us. If the cmpxchg fails, we don't care.
+                */
+               (void)local_cmpxchg(&next_page->write, old_write, val);
+               (void)local_cmpxchg(&next_page->entries, old_entries, eval);
+
+               /*
+                * No need to worry about races with clearing out the commit:
+                * it can only increment when a commit takes place. But that
+                * only happens in the outermost nested commit.
+                */
+               local_set(&next_page->page->commit, 0);
+
+               old_tail = cmpxchg(&cpu_buffer->tail_page,
+                                  tail_page, next_page);
+
+               if (old_tail == tail_page)
+                       ret = 1;
+       }
+
+       return ret;
+}
+
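The reset-only-if-nobody-interfered idiom used in rb_tail_page_update() can be sketched in user space with a GCC __sync builtin (a simplified single-word model; the kernel's local_cmpxchg() is a cheaper per-CPU variant, and the helper name here is made up):

	/*
	 * Zero the low 20-bit index of *word, but only if no further updater
	 * has bumped the word since old_write was read. Returns 1 on success.
	 */
	static int try_reset_index(unsigned long *word, unsigned long old_write)
	{
		unsigned long val = old_write & ~0xfffffUL;   /* keep the updater count */

		return __sync_val_compare_and_swap(word, old_write, val) == old_write;
	}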
+static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
+                         struct buffer_page *bpage)
+{
+       unsigned long val = (unsigned long)bpage;
+
+       if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
+               return 1;
+
+       return 0;
+}
+
+/**
+ * rb_check_list - make sure a list_head's pointers have the last bits zero
+ */
+static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
+                        struct list_head *list)
+{
+       if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
+               return 1;
+       if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
+               return 1;
+       return 0;
+}
+
 /**
  * check_pages - integrity check of buffer pages
  * @cpu_buffer: CPU buffer with pages to test
@@ -498,14 +904,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
  */
 static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
 {
-       struct list_head *head = &cpu_buffer->pages;
+       struct list_head *head = cpu_buffer->pages;
        struct buffer_page *bpage, *tmp;
 
+       rb_head_page_deactivate(cpu_buffer);
+
        if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
                return -1;
        if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
                return -1;
 
+       if (rb_check_list(cpu_buffer, head))
+               return -1;
+
        list_for_each_entry_safe(bpage, tmp, head, list) {
                if (RB_WARN_ON(cpu_buffer,
                               bpage->list.next->prev != &bpage->list))
@@ -513,25 +924,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
                if (RB_WARN_ON(cpu_buffer,
                               bpage->list.prev->next != &bpage->list))
                        return -1;
+               if (rb_check_list(cpu_buffer, &bpage->list))
+                       return -1;
        }
 
+       rb_head_page_activate(cpu_buffer);
+
        return 0;
 }
 
 static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
                             unsigned nr_pages)
 {
-       struct list_head *head = &cpu_buffer->pages;
        struct buffer_page *bpage, *tmp;
        unsigned long addr;
        LIST_HEAD(pages);
        unsigned i;
 
+       WARN_ON(!nr_pages);
+
        for (i = 0; i < nr_pages; i++) {
                bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
                                    GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
                if (!bpage)
                        goto free_pages;
+
+               rb_check_bpage(cpu_buffer, bpage);
+
                list_add(&bpage->list, &pages);
 
                addr = __get_free_page(GFP_KERNEL);
@@ -541,7 +960,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
                rb_init_page(bpage->page);
        }
 
-       list_splice(&pages, head);
+       /*
+        * The ring buffer page list is a circular list that does not
+        * start and end with a list head. All page list items point to
+        * other pages.
+        */
+       cpu_buffer->pages = pages.next;
+       list_del(&pages);
 
        rb_check_pages(cpu_buffer);
 
@@ -573,13 +998,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
        spin_lock_init(&cpu_buffer->reader_lock);
        lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
        cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-       INIT_LIST_HEAD(&cpu_buffer->pages);
 
        bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
                            GFP_KERNEL, cpu_to_node(cpu));
        if (!bpage)
                goto fail_free_buffer;
 
+       rb_check_bpage(cpu_buffer, bpage);
+
        cpu_buffer->reader_page = bpage;
        addr = __get_free_page(GFP_KERNEL);
        if (!addr)
@@ -594,9 +1020,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
                goto fail_free_reader;
 
        cpu_buffer->head_page
-               = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+               = list_entry(cpu_buffer->pages, struct buffer_page, list);
        cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
 
+       rb_head_page_activate(cpu_buffer);
+
        return cpu_buffer;
 
  fail_free_reader:
@@ -609,15 +1037,22 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 
 static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 {
-       struct list_head *head = &cpu_buffer->pages;
+       struct list_head *head = cpu_buffer->pages;
        struct buffer_page *bpage, *tmp;
 
        free_buffer_page(cpu_buffer->reader_page);
 
-       list_for_each_entry_safe(bpage, tmp, head, list) {
-               list_del_init(&bpage->list);
+       rb_head_page_deactivate(cpu_buffer);
+
+       if (head) {
+               list_for_each_entry_safe(bpage, tmp, head, list) {
+                       list_del_init(&bpage->list);
+                       free_buffer_page(bpage);
+               }
+               bpage = list_entry(head, struct buffer_page, list);
                free_buffer_page(bpage);
        }
+
        kfree(cpu_buffer);
 }
 
@@ -760,15 +1195,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
        atomic_inc(&cpu_buffer->record_disabled);
        synchronize_sched();
 
+       rb_head_page_deactivate(cpu_buffer);
+
        for (i = 0; i < nr_pages; i++) {
-               if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
+               if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
                        return;
-               p = cpu_buffer->pages.next;
+               p = cpu_buffer->pages->next;
                bpage = list_entry(p, struct buffer_page, list);
                list_del_init(&bpage->list);
                free_buffer_page(bpage);
        }
-       if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
+       if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
                return;
 
        rb_reset_cpu(cpu_buffer);
@@ -790,15 +1227,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
        atomic_inc(&cpu_buffer->record_disabled);
        synchronize_sched();
 
+       spin_lock_irq(&cpu_buffer->reader_lock);
+       rb_head_page_deactivate(cpu_buffer);
+
        for (i = 0; i < nr_pages; i++) {
                if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
                        return;
                p = pages->next;
                bpage = list_entry(p, struct buffer_page, list);
                list_del_init(&bpage->list);
-               list_add_tail(&bpage->list, &cpu_buffer->pages);
+               list_add_tail(&bpage->list, cpu_buffer->pages);
        }
        rb_reset_cpu(cpu_buffer);
+       spin_unlock_irq(&cpu_buffer->reader_lock);
 
        rb_check_pages(cpu_buffer);
 
@@ -948,22 +1389,15 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
                               cpu_buffer->reader_page->read);
 }
 
-static inline struct ring_buffer_event *
-rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
-{
-       return __rb_page_index(cpu_buffer->head_page,
-                              cpu_buffer->head_page->read);
-}
-
 static inline struct ring_buffer_event *
 rb_iter_head_event(struct ring_buffer_iter *iter)
 {
        return __rb_page_index(iter->head_page, iter->head);
 }
 
-static inline unsigned rb_page_write(struct buffer_page *bpage)
+static inline unsigned long rb_page_write(struct buffer_page *bpage)
 {
-       return local_read(&bpage->write);
+       return local_read(&bpage->write) & RB_WRITE_MASK;
 }
 
 static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -971,6 +1405,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
        return local_read(&bpage->page->commit);
 }
 
+static inline unsigned long rb_page_entries(struct buffer_page *bpage)
+{
+       return local_read(&bpage->entries) & RB_WRITE_MASK;
+}
+
 /* Size is determined by what has been committed */
 static inline unsigned rb_page_size(struct buffer_page *bpage)
 {
@@ -983,22 +1422,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
        return rb_page_commit(cpu_buffer->commit_page);
 }
 
-static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
-{
-       return rb_page_commit(cpu_buffer->head_page);
-}
-
-static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
-                              struct buffer_page **bpage)
-{
-       struct list_head *p = (*bpage)->list.next;
-
-       if (p == &cpu_buffer->pages)
-               p = p->next;
-
-       *bpage = list_entry(p, struct buffer_page, list);
-}
-
 static inline unsigned
 rb_event_index(struct ring_buffer_event *event)
 {
@@ -1024,6 +1447,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
 static void
 rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 {
+       unsigned long max_count;
+
        /*
         * We only race with interrupts and NMIs on this CPU.
         * If we own the commit event, then we can commit
@@ -1033,9 +1458,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
         * assign the commit to the tail.
         */
  again:
+       max_count = cpu_buffer->buffer->pages * 100;
+
        while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
-               cpu_buffer->commit_page->page->commit =
-                       cpu_buffer->commit_page->write;
+               if (RB_WARN_ON(cpu_buffer, !(--max_count)))
+                       return;
+               if (RB_WARN_ON(cpu_buffer,
+                              rb_is_reader_page(cpu_buffer->tail_page)))
+                       return;
+               local_set(&cpu_buffer->commit_page->page->commit,
+                         rb_page_write(cpu_buffer->commit_page));
                rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
                cpu_buffer->write_stamp =
                        cpu_buffer->commit_page->page->time_stamp;
@@ -1044,8 +1476,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
        }
        while (rb_commit_index(cpu_buffer) !=
               rb_page_write(cpu_buffer->commit_page)) {
-               cpu_buffer->commit_page->page->commit =
-                       cpu_buffer->commit_page->write;
+
+               local_set(&cpu_buffer->commit_page->page->commit,
+                         rb_page_write(cpu_buffer->commit_page));
+               RB_WARN_ON(cpu_buffer,
+                          local_read(&cpu_buffer->commit_page->page->commit) &
+                          ~RB_WRITE_MASK);
                barrier();
        }
 
@@ -1078,7 +1514,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
         * to the head page instead of next.
         */
        if (iter->head_page == cpu_buffer->reader_page)
-               iter->head_page = cpu_buffer->head_page;
+               iter->head_page = rb_set_head_page(cpu_buffer);
        else
                rb_inc_page(cpu_buffer, &iter->head_page);
 
@@ -1122,6 +1558,163 @@ rb_update_event(struct ring_buffer_event *event,
        }
 }
 
+/*
+ * rb_handle_head_page - writer hit the head page
+ *
+ * Returns: +1 to retry page
+ *           0 to continue
+ *          -1 on error
+ */
+static int
+rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
+                   struct buffer_page *tail_page,
+                   struct buffer_page *next_page)
+{
+       struct buffer_page *new_head;
+       int entries;
+       int type;
+       int ret;
+
+       entries = rb_page_entries(next_page);
+
+       /*
+        * The hard part is here. We need to move the head
+        * forward, and protect against both readers on
+        * other CPUs and writers coming in via interrupts.
+        */
+       type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
+                                      RB_PAGE_HEAD);
+
+       /*
+        * type can be one of four:
+        *  NORMAL - an interrupt already moved it for us
+        *  HEAD   - we are the first to get here.
+        *  UPDATE - we are the interrupt interrupting
+        *           a current move.
+        *  MOVED  - a reader on another CPU moved the next
+        *           pointer to its reader page. Give up
+        *           and try again.
+        */
+
+       switch (type) {
+       case RB_PAGE_HEAD:
+               /*
+                * We changed the head to UPDATE, thus
+                * it is our responsibility to update
+                * the counters.
+                */
+               local_add(entries, &cpu_buffer->overrun);
+
+               /*
+                * The entries will be zeroed out when we move the
+                * tail page.
+                */
+
+               /* still more to do */
+               break;
+
+       case RB_PAGE_UPDATE:
+               /*
+                * This is an interrupt that interrupted the
+                * previous update. Still more to do.
+                */
+               break;
+       case RB_PAGE_NORMAL:
+               /*
+                * An interrupt came in before the update
+                * and processed this for us.
+                * Nothing left to do.
+                */
+               return 1;
+       case RB_PAGE_MOVED:
+               /*
+                * The reader is on another CPU and just did
+                * a swap with our next_page.
+                * Try again.
+                */
+               return 1;
+       default:
+               RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
+               return -1;
+       }
+
+       /*
+        * Now that we are here, the old head pointer is
+        * set to UPDATE. This will keep the reader from
+        * swapping the head page with the reader page.
+        * The reader (on another CPU) will spin till
+        * we are finished.
+        *
+        * We just need to protect against interrupts
+        * doing the job. We will set the next pointer
+        * to HEAD. After that, we set the old pointer
+        * to NORMAL, but only if it was HEAD before;
+        * otherwise we are an interrupt, and only
+        * want the outermost commit to reset it.
+        */
+       new_head = next_page;
+       rb_inc_page(cpu_buffer, &new_head);
+
+       ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
+                                   RB_PAGE_NORMAL);
+
+       /*
+        * Valid returns are:
+        *  HEAD   - an interrupt came in and already set it.
+        *  NORMAL - One of two things:
+        *            1) We really set it.
+        *            2) A bunch of interrupts came in and moved
+        *               the page forward again.
+        */
+       switch (ret) {
+       case RB_PAGE_HEAD:
+       case RB_PAGE_NORMAL:
+               /* OK */
+               break;
+       default:
+               RB_WARN_ON(cpu_buffer, 1);
+               return -1;
+       }
+
+       /*
+        * It is possible that an interrupt came in,
+        * set the head up, then more interrupts came in
+        * and moved it again. When we get back here,
+        * the page would have been set to NORMAL but we
+        * just set it back to HEAD.
+        *
+        * How do you detect this? Well, if that happened
+        * the tail page would have moved.
+        */
+       if (ret == RB_PAGE_NORMAL) {
+               /*
+                * If the tail had moved past next, then we need
+                * to reset the pointer.
+                */
+               if (cpu_buffer->tail_page != tail_page &&
+                   cpu_buffer->tail_page != next_page)
+                       rb_head_page_set_normal(cpu_buffer, new_head,
+                                               next_page,
+                                               RB_PAGE_HEAD);
+       }
+
+       /*
+        * If this was the outer most commit (the one that
+        * changed the original pointer from HEAD to UPDATE),
+        * then it is up to us to reset it to NORMAL.
+        */
+       if (type == RB_PAGE_HEAD) {
+               ret = rb_head_page_set_normal(cpu_buffer, next_page,
+                                             tail_page,
+                                             RB_PAGE_UPDATE);
+               if (RB_WARN_ON(cpu_buffer,
+                              ret != RB_PAGE_UPDATE))
+                       return -1;
+       }
+
+       return 0;
+}
+
 static unsigned rb_calculate_event_length(unsigned length)
 {
        struct ring_buffer_event event; /* Used only for sizeof array */
@@ -1200,96 +1793,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
             struct buffer_page *commit_page,
             struct buffer_page *tail_page, u64 *ts)
 {
-       struct buffer_page *next_page, *head_page, *reader_page;
        struct ring_buffer *buffer = cpu_buffer->buffer;
-       bool lock_taken = false;
-       unsigned long flags;
+       struct buffer_page *next_page;
+       int ret;
 
        next_page = tail_page;
 
-       local_irq_save(flags);
-       /*
-        * Since the write to the buffer is still not
-        * fully lockless, we must be careful with NMIs.
-        * The locks in the writers are taken when a write
-        * crosses to a new page. The locks protect against
-        * races with the readers (this will soon be fixed
-        * with a lockless solution).
-        *
-        * Because we can not protect against NMIs, and we
-        * want to keep traces reentrant, we need to manage
-        * what happens when we are in an NMI.
-        *
-        * NMIs can happen after we take the lock.
-        * If we are in an NMI, only take the lock
-        * if it is not already taken. Otherwise
-        * simply fail.
-        */
-       if (unlikely(in_nmi())) {
-               if (!__raw_spin_trylock(&cpu_buffer->lock)) {
-                       cpu_buffer->nmi_dropped++;
-                       goto out_reset;
-               }
-       } else
-               __raw_spin_lock(&cpu_buffer->lock);
-
-       lock_taken = true;
-
        rb_inc_page(cpu_buffer, &next_page);
 
-       head_page = cpu_buffer->head_page;
-       reader_page = cpu_buffer->reader_page;
-
-       /* we grabbed the lock before incrementing */
-       if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
-               goto out_reset;
-
        /*
         * If for some reason, we had an interrupt storm that made
         * it all the way around the buffer, bail, and warn
         * about it.
         */
        if (unlikely(next_page == commit_page)) {
-               cpu_buffer->commit_overrun++;
+               local_inc(&cpu_buffer->commit_overrun);
                goto out_reset;
        }
 
-       if (next_page == head_page) {
-               if (!(buffer->flags & RB_FL_OVERWRITE))
-                       goto out_reset;
-
-               /* tail_page has not moved yet? */
-               if (tail_page == cpu_buffer->tail_page) {
-                       /* count overflows */
-                       cpu_buffer->overrun +=
-                               local_read(&head_page->entries);
+       /*
+        * This is where the fun begins!
+        *
+        * We are fighting against races between a reader that
+        * could be on another CPU trying to swap its reader
+        * page with the buffer head.
+        *
+        * We are also fighting against interrupts coming in and
+        * moving the head or tail on us as well.
+        *
+        * If the next page is the head page then we have filled
+        * the buffer, unless the commit page is still on the
+        * reader page.
+        */
+       if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
 
-                       rb_inc_page(cpu_buffer, &head_page);
-                       cpu_buffer->head_page = head_page;
-                       cpu_buffer->head_page->read = 0;
+               /*
+                * If the commit is not on the reader page, then
+                * move the header page.
+                */
+               if (!rb_is_reader_page(cpu_buffer->commit_page)) {
+                       /*
+                        * If we are not in overwrite mode,
+                        * this is easy, just stop here.
+                        */
+                       if (!(buffer->flags & RB_FL_OVERWRITE))
+                               goto out_reset;
+
+                       ret = rb_handle_head_page(cpu_buffer,
+                                                 tail_page,
+                                                 next_page);
+                       if (ret < 0)
+                               goto out_reset;
+                       if (ret)
+                               goto out_again;
+               } else {
+                       /*
+                        * We need to be careful here too. The
+                        * commit page could still be on the reader
+                        * page. We could have a small buffer, and
+                        * have filled up the buffer with events
+                        * from interrupts and such, and wrapped.
+                        *
+                        * Note, if the tail page is also on the
+                        * reader_page, we let it move out.
+                        */
+                       if (unlikely((cpu_buffer->commit_page !=
+                                     cpu_buffer->tail_page) &&
+                                    (cpu_buffer->commit_page ==
+                                     cpu_buffer->reader_page))) {
+                               local_inc(&cpu_buffer->commit_overrun);
+                               goto out_reset;
+                       }
                }
        }
 
-       /*
-        * If the tail page is still the same as what we think
-        * it is, then it is up to us to update the tail
-        * pointer.
-        */
-       if (tail_page == cpu_buffer->tail_page) {
-               local_set(&next_page->write, 0);
-               local_set(&next_page->entries, 0);
-               local_set(&next_page->page->commit, 0);
-               cpu_buffer->tail_page = next_page;
-
-               /* reread the time stamp */
+       ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
+       if (ret) {
+               /*
+                * Nested commits always have zero deltas, so
+                * just reread the time stamp
+                */
                *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
-               cpu_buffer->tail_page->page->time_stamp = *ts;
+               next_page->page->time_stamp = *ts;
        }
 
-       rb_reset_tail(cpu_buffer, tail_page, tail, length);
+ out_again:
 
-       __raw_spin_unlock(&cpu_buffer->lock);
-       local_irq_restore(flags);
+       rb_reset_tail(cpu_buffer, tail_page, tail, length);
 
        /* fail and let the caller try again */
        return ERR_PTR(-EAGAIN);
@@ -1298,9 +1888,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
        /* reset write */
        rb_reset_tail(cpu_buffer, tail_page, tail, length);
 
-       if (likely(lock_taken))
-               __raw_spin_unlock(&cpu_buffer->lock);
-       local_irq_restore(flags);
        return NULL;
 }
 
@@ -1317,6 +1904,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
        barrier();
        tail_page = cpu_buffer->tail_page;
        write = local_add_return(length, &tail_page->write);
+
+       /* set write to only the index of the write */
+       write &= RB_WRITE_MASK;
        tail = write - length;
 
        /* See if we shot pass the end of this buffer page */
@@ -1361,12 +1951,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
        bpage = cpu_buffer->tail_page;
 
        if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+               unsigned long write_mask =
+                       local_read(&bpage->write) & ~RB_WRITE_MASK;
                /*
                 * This is on the tail page. It is possible that
                 * a write could come in and move the tail page
                 * and write to the next page. That is fine
                 * because we just shorten what is on this page.
                 */
+               old_index += write_mask;
+               new_index += write_mask;
                index = local_cmpxchg(&bpage->write, old_index, new_index);
                if (index == old_index)
                        return 1;
@@ -1875,9 +2469,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
 static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
 {
        struct buffer_page *reader = cpu_buffer->reader_page;
-       struct buffer_page *head = cpu_buffer->head_page;
+       struct buffer_page *head = rb_set_head_page(cpu_buffer);
        struct buffer_page *commit = cpu_buffer->commit_page;
 
+       /* In case of error, head will be NULL */
+       if (unlikely(!head))
+               return 1;
+
        return reader->read == rb_page_commit(reader) &&
                (commit == reader ||
                 (commit == head &&
@@ -1968,7 +2566,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
                return 0;
 
        cpu_buffer = buffer->buffers[cpu];
-       ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
+       ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
                - cpu_buffer->read;
 
        return ret;
@@ -1989,32 +2587,12 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
                return 0;
 
        cpu_buffer = buffer->buffers[cpu];
-       ret = cpu_buffer->overrun;
+       ret = local_read(&cpu_buffer->overrun);
 
        return ret;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
 
-/**
- * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
- * @buffer: The ring buffer
- * @cpu: The per CPU buffer to get the number of overruns from
- */
-unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
-{
-       struct ring_buffer_per_cpu *cpu_buffer;
-       unsigned long ret;
-
-       if (!cpumask_test_cpu(cpu, buffer->cpumask))
-               return 0;
-
-       cpu_buffer = buffer->buffers[cpu];
-       ret = cpu_buffer->nmi_dropped;
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
-
 /**
  * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
  * @buffer: The ring buffer
@@ -2030,7 +2608,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
                return 0;
 
        cpu_buffer = buffer->buffers[cpu];
-       ret = cpu_buffer->commit_overrun;
+       ret = local_read(&cpu_buffer->commit_overrun);
 
        return ret;
 }
@@ -2053,7 +2631,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
        for_each_buffer_cpu(buffer, cpu) {
                cpu_buffer = buffer->buffers[cpu];
                entries += (local_read(&cpu_buffer->entries) -
-                           cpu_buffer->overrun) - cpu_buffer->read;
+                           local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
        }
 
        return entries;
@@ -2076,7 +2654,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
        /* if you care about this being correct, lock the buffer */
        for_each_buffer_cpu(buffer, cpu) {
                cpu_buffer = buffer->buffers[cpu];
-               overruns += cpu_buffer->overrun;
+               overruns += local_read(&cpu_buffer->overrun);
        }
 
        return overruns;
@@ -2089,8 +2667,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
 
        /* Iterator usage is expected to have record disabled */
        if (list_empty(&cpu_buffer->reader_page->list)) {
-               iter->head_page = cpu_buffer->head_page;
-               iter->head = cpu_buffer->head_page->read;
+               iter->head_page = rb_set_head_page(cpu_buffer);
+               if (unlikely(!iter->head_page))
+                       return;
+               iter->head = iter->head_page->read;
        } else {
                iter->head_page = cpu_buffer->reader_page;
                iter->head = cpu_buffer->reader_page->read;
@@ -2207,6 +2787,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
        struct buffer_page *reader = NULL;
        unsigned long flags;
        int nr_loops = 0;
+       int ret;
 
        local_irq_save(flags);
        __raw_spin_lock(&cpu_buffer->lock);
@@ -2240,30 +2821,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
                goto out;
 
        /*
-        * Splice the empty reader page into the list around the head.
         * Reset the reader page to size zero.
         */
+       local_set(&cpu_buffer->reader_page->write, 0);
+       local_set(&cpu_buffer->reader_page->entries, 0);
+       local_set(&cpu_buffer->reader_page->page->commit, 0);
 
-       reader = cpu_buffer->head_page;
+ spin:
+       /*
+        * Splice the empty reader page into the list around the head.
+        */
+       reader = rb_set_head_page(cpu_buffer);
        cpu_buffer->reader_page->list.next = reader->list.next;
        cpu_buffer->reader_page->list.prev = reader->list.prev;
 
-       local_set(&cpu_buffer->reader_page->write, 0);
-       local_set(&cpu_buffer->reader_page->entries, 0);
-       local_set(&cpu_buffer->reader_page->page->commit, 0);
+       /*
+        * cpu_buffer->pages just needs to point to the buffer; it
+        * has no specific buffer page to point to. Let's move it out
+        * of our way so we don't accidentally swap it.
+        */
+       cpu_buffer->pages = reader->list.prev;
 
-       /* Make the reader page now replace the head */
-       reader->list.prev->next = &cpu_buffer->reader_page->list;
-       reader->list.next->prev = &cpu_buffer->reader_page->list;
+       /* The reader page will be pointing to the new head */
+       rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
+
+       /*
+        * Here's the tricky part.
+        *
+        * We need to move the pointer past the header page.
+        * But we can only do that if a writer is not currently
+        * moving it. The page before the header page has the
+        * flag bit '1' set if it is pointing to the page we want,
+        * but if the writer is in the process of moving it
+        * then it will be '2' or, if already moved, '0'.
+        */
+
+       ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
 
        /*
-        * If the tail is on the reader, then we must set the head
-        * to the inserted page, otherwise we set it one before.
+        * If we did not convert it, then we must try again.
         */
-       cpu_buffer->head_page = cpu_buffer->reader_page;
+       if (!ret)
+               goto spin;
 
-       if (cpu_buffer->commit_page != reader)
-               rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+       /*
+        * Yeah! We succeeded in replacing the page.
+        *
+        * Now make the new head point back to the reader page.
+        */
+       reader->list.next->prev = &cpu_buffer->reader_page->list;
+       rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
 
        /* Finally update the reader page to the new head */
        cpu_buffer->reader_page = reader;
@@ -2717,8 +3324,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
 static void
 rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 {
+       rb_head_page_deactivate(cpu_buffer);
+
        cpu_buffer->head_page
-               = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+               = list_entry(cpu_buffer->pages, struct buffer_page, list);
        local_set(&cpu_buffer->head_page->write, 0);
        local_set(&cpu_buffer->head_page->entries, 0);
        local_set(&cpu_buffer->head_page->page->commit, 0);
@@ -2734,16 +3343,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
        local_set(&cpu_buffer->reader_page->page->commit, 0);
        cpu_buffer->reader_page->read = 0;
 
-       cpu_buffer->nmi_dropped = 0;
-       cpu_buffer->commit_overrun = 0;
-       cpu_buffer->overrun = 0;
-       cpu_buffer->read = 0;
+       local_set(&cpu_buffer->commit_overrun, 0);
+       local_set(&cpu_buffer->overrun, 0);
        local_set(&cpu_buffer->entries, 0);
        local_set(&cpu_buffer->committing, 0);
        local_set(&cpu_buffer->commits, 0);
+       cpu_buffer->read = 0;
 
        cpu_buffer->write_stamp = 0;
        cpu_buffer->read_stamp = 0;
+
+       rb_head_page_activate(cpu_buffer);
 }
 
 /**
@@ -3091,7 +3701,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
                read = 0;
        } else {
                /* update the entry counter */
-               cpu_buffer->read += local_read(&reader->entries);
+               cpu_buffer->read += rb_page_entries(reader);
 
                /* swap the pages */
                rb_init_page(bpage);
index c22b40f8f576c19e7c00dec5e8f1983667e7b084..e793cda91dd35dd0926b1573f557092ae3db1182 100644 (file)
@@ -50,7 +50,7 @@ unsigned long __read_mostly   tracing_thresh;
  * On boot up, the ring buffer is set to the minimum size, so that
  * we do not waste memory on systems that are not using tracing.
  */
-static int ring_buffer_expanded;
+int ring_buffer_expanded;
 
 /*
  * We need to change this state when a selftest is running.
@@ -64,7 +64,7 @@ static bool __read_mostly tracing_selftest_running;
 /*
  * If a tracer is running, we do not want to run SELFTEST.
  */
-static bool __read_mostly tracing_selftest_disabled;
+bool __read_mostly tracing_selftest_disabled;
 
 /* For tracers that don't implement custom flags */
 static struct tracer_opt dummy_tracer_opt[] = {
@@ -89,7 +89,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
  */
 static int tracing_disabled = 1;
 
-static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
+DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
 
 static inline void ftrace_disable_cpu(void)
 {
@@ -867,10 +867,6 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
 
        return event;
 }
-static void ftrace_trace_stack(struct trace_array *tr,
-                              unsigned long flags, int skip, int pc);
-static void ftrace_trace_userstack(struct trace_array *tr,
-                                  unsigned long flags, int pc);
 
 static inline void __trace_buffer_unlock_commit(struct trace_array *tr,
                                        struct ring_buffer_event *event,
@@ -947,54 +943,6 @@ trace_function(struct trace_array *tr,
                ring_buffer_unlock_commit(tr->buffer, event);
 }
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int __trace_graph_entry(struct trace_array *tr,
-                               struct ftrace_graph_ent *trace,
-                               unsigned long flags,
-                               int pc)
-{
-       struct ftrace_event_call *call = &event_funcgraph_entry;
-       struct ring_buffer_event *event;
-       struct ftrace_graph_ent_entry *entry;
-
-       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
-               return 0;
-
-       event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
-                                         sizeof(*entry), flags, pc);
-       if (!event)
-               return 0;
-       entry   = ring_buffer_event_data(event);
-       entry->graph_ent                        = *trace;
-       if (!filter_current_check_discard(call, entry, event))
-               ring_buffer_unlock_commit(global_trace.buffer, event);
-
-       return 1;
-}
-
-static void __trace_graph_return(struct trace_array *tr,
-                               struct ftrace_graph_ret *trace,
-                               unsigned long flags,
-                               int pc)
-{
-       struct ftrace_event_call *call = &event_funcgraph_exit;
-       struct ring_buffer_event *event;
-       struct ftrace_graph_ret_entry *entry;
-
-       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
-               return;
-
-       event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
-                                         sizeof(*entry), flags, pc);
-       if (!event)
-               return;
-       entry   = ring_buffer_event_data(event);
-       entry->ret                              = *trace;
-       if (!filter_current_check_discard(call, entry, event))
-               ring_buffer_unlock_commit(global_trace.buffer, event);
-}
-#endif
-
 void
 ftrace(struct trace_array *tr, struct trace_array_cpu *data,
        unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -1004,11 +952,11 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
                trace_function(tr, ip, parent_ip, flags, pc);
 }
 
+#ifdef CONFIG_STACKTRACE
 static void __ftrace_trace_stack(struct trace_array *tr,
                                 unsigned long flags,
                                 int skip, int pc)
 {
-#ifdef CONFIG_STACKTRACE
        struct ftrace_event_call *call = &event_kernel_stack;
        struct ring_buffer_event *event;
        struct stack_entry *entry;
@@ -1029,12 +977,10 @@ static void __ftrace_trace_stack(struct trace_array *tr,
        save_stack_trace(&trace);
        if (!filter_check_discard(call, entry, tr->buffer, event))
                ring_buffer_unlock_commit(tr->buffer, event);
-#endif
 }
 
-static void ftrace_trace_stack(struct trace_array *tr,
-                              unsigned long flags,
-                              int skip, int pc)
+void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+                       int pc)
 {
        if (!(trace_flags & TRACE_ITER_STACKTRACE))
                return;
@@ -1042,17 +988,14 @@ static void ftrace_trace_stack(struct trace_array *tr,
        __ftrace_trace_stack(tr, flags, skip, pc);
 }
 
-void __trace_stack(struct trace_array *tr,
-                  unsigned long flags,
-                  int skip, int pc)
+void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+                  int pc)
 {
        __ftrace_trace_stack(tr, flags, skip, pc);
 }
 
-static void ftrace_trace_userstack(struct trace_array *tr,
-                                  unsigned long flags, int pc)
+void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc)
 {
-#ifdef CONFIG_STACKTRACE
        struct ftrace_event_call *call = &event_user_stack;
        struct ring_buffer_event *event;
        struct userstack_entry *entry;
@@ -1077,7 +1020,6 @@ static void ftrace_trace_userstack(struct trace_array *tr,
        save_stack_trace_user(&trace);
        if (!filter_check_discard(call, entry, tr->buffer, event))
                ring_buffer_unlock_commit(tr->buffer, event);
-#endif
 }
 
 #ifdef UNUSED
@@ -1087,6 +1029,8 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
 }
 #endif /* UNUSED */
 
+#endif /* CONFIG_STACKTRACE */
+
 static void
 ftrace_trace_special(void *__tr,
                     unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -1114,62 +1058,6 @@ __trace_special(void *__tr, void *__data,
        ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
 }
 
-void
-tracing_sched_switch_trace(struct trace_array *tr,
-                          struct task_struct *prev,
-                          struct task_struct *next,
-                          unsigned long flags, int pc)
-{
-       struct ftrace_event_call *call = &event_context_switch;
-       struct ring_buffer_event *event;
-       struct ctx_switch_entry *entry;
-
-       event = trace_buffer_lock_reserve(tr, TRACE_CTX,
-                                         sizeof(*entry), flags, pc);
-       if (!event)
-               return;
-       entry   = ring_buffer_event_data(event);
-       entry->prev_pid                 = prev->pid;
-       entry->prev_prio                = prev->prio;
-       entry->prev_state               = prev->state;
-       entry->next_pid                 = next->pid;
-       entry->next_prio                = next->prio;
-       entry->next_state               = next->state;
-       entry->next_cpu = task_cpu(next);
-
-       if (!filter_check_discard(call, entry, tr->buffer, event))
-               trace_buffer_unlock_commit(tr, event, flags, pc);
-}
-
-void
-tracing_sched_wakeup_trace(struct trace_array *tr,
-                          struct task_struct *wakee,
-                          struct task_struct *curr,
-                          unsigned long flags, int pc)
-{
-       struct ftrace_event_call *call = &event_wakeup;
-       struct ring_buffer_event *event;
-       struct ctx_switch_entry *entry;
-
-       event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
-                                         sizeof(*entry), flags, pc);
-       if (!event)
-               return;
-       entry   = ring_buffer_event_data(event);
-       entry->prev_pid                 = curr->pid;
-       entry->prev_prio                = curr->prio;
-       entry->prev_state               = curr->state;
-       entry->next_pid                 = wakee->pid;
-       entry->next_prio                = wakee->prio;
-       entry->next_state               = wakee->state;
-       entry->next_cpu                 = task_cpu(wakee);
-
-       if (!filter_check_discard(call, entry, tr->buffer, event))
-               ring_buffer_unlock_commit(tr->buffer, event);
-       ftrace_trace_stack(tr, flags, 6, pc);
-       ftrace_trace_userstack(tr, flags, pc);
-}
-
 void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
 {
@@ -1194,68 +1082,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
        local_irq_restore(flags);
 }
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-int trace_graph_entry(struct ftrace_graph_ent *trace)
-{
-       struct trace_array *tr = &global_trace;
-       struct trace_array_cpu *data;
-       unsigned long flags;
-       long disabled;
-       int ret;
-       int cpu;
-       int pc;
-
-       if (!ftrace_trace_task(current))
-               return 0;
-
-       if (!ftrace_graph_addr(trace->func))
-               return 0;
-
-       local_irq_save(flags);
-       cpu = raw_smp_processor_id();
-       data = tr->data[cpu];
-       disabled = atomic_inc_return(&data->disabled);
-       if (likely(disabled == 1)) {
-               pc = preempt_count();
-               ret = __trace_graph_entry(tr, trace, flags, pc);
-       } else {
-               ret = 0;
-       }
-       /* Only do the atomic if it is not already set */
-       if (!test_tsk_trace_graph(current))
-               set_tsk_trace_graph(current);
-
-       atomic_dec(&data->disabled);
-       local_irq_restore(flags);
-
-       return ret;
-}
-
-void trace_graph_return(struct ftrace_graph_ret *trace)
-{
-       struct trace_array *tr = &global_trace;
-       struct trace_array_cpu *data;
-       unsigned long flags;
-       long disabled;
-       int cpu;
-       int pc;
-
-       local_irq_save(flags);
-       cpu = raw_smp_processor_id();
-       data = tr->data[cpu];
-       disabled = atomic_inc_return(&data->disabled);
-       if (likely(disabled == 1)) {
-               pc = preempt_count();
-               __trace_graph_return(tr, trace, flags, pc);
-       }
-       if (!trace->depth)
-               clear_tsk_trace_graph(current);
-       atomic_dec(&data->disabled);
-       local_irq_restore(flags);
-}
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-
-
 /**
  * trace_vbprintk - write binary msg to tracing buffer
  *
@@ -2257,8 +2083,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
                len += 3; /* "no" and newline */
        }
 
-       /* +2 for \n and \0 */
-       buf = kmalloc(len + 2, GFP_KERNEL);
+       /* +1 for \0 */
+       buf = kmalloc(len + 1, GFP_KERNEL);
        if (!buf) {
                mutex_unlock(&trace_types_lock);
                return -ENOMEM;
@@ -2281,7 +2107,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
        }
        mutex_unlock(&trace_types_lock);
 
-       WARN_ON(r >= len + 2);
+       WARN_ON(r >= len + 1);
 
        r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 
@@ -3633,9 +3459,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
        cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
        trace_seq_printf(s, "commit overrun: %ld\n", cnt);
 
-       cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
-       trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
-
        count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
 
        kfree(s);
@@ -4273,7 +4096,6 @@ void ftrace_dump(void)
 
 __init static int tracer_alloc_buffers(void)
 {
-       struct trace_array_cpu *data;
        int ring_buf_size;
        int i;
        int ret = -ENOMEM;
@@ -4323,7 +4145,7 @@ __init static int tracer_alloc_buffers(void)
 
        /* Allocate the first page for all buffers */
        for_each_tracing_cpu(i) {
-               data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+               global_trace.data[i] = &per_cpu(global_trace_cpu, i);
                max_tr.data[i] = &per_cpu(max_data, i);
        }
 
index 8b9f4f6e9559a6d152e874f24f75dae235638fc9..d682357e4b1fda9f45cf6e128d41c57e6bcc4c71 100644 (file)
@@ -467,6 +467,7 @@ void trace_function(struct trace_array *tr,
 
 void trace_graph_return(struct ftrace_graph_ret *trace);
 int trace_graph_entry(struct ftrace_graph_ent *trace);
+void set_graph_array(struct trace_array *tr);
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
@@ -485,9 +486,31 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
 void update_max_tr_single(struct trace_array *tr,
                          struct task_struct *tsk, int cpu);
 
-void __trace_stack(struct trace_array *tr,
-                  unsigned long flags,
-                  int skip, int pc);
+#ifdef CONFIG_STACKTRACE
+void ftrace_trace_stack(struct trace_array *tr, unsigned long flags,
+                       int skip, int pc);
+
+void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags,
+                           int pc);
+
+void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+                  int pc);
+#else
+static inline void ftrace_trace_stack(struct trace_array *tr,
+                                     unsigned long flags, int skip, int pc)
+{
+}
+
+static inline void ftrace_trace_userstack(struct trace_array *tr,
+                                         unsigned long flags, int pc)
+{
+}
+
+static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
+                                int skip, int pc)
+{
+}
+#endif /* CONFIG_STACKTRACE */
 
 extern cycle_t ftrace_now(int cpu);
 
@@ -513,6 +536,10 @@ extern unsigned long ftrace_update_tot_cnt;
 extern int DYN_FTRACE_TEST_NAME(void);
 #endif
 
+extern int ring_buffer_expanded;
+extern bool tracing_selftest_disabled;
+DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
+
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 extern int trace_selftest_startup_function(struct tracer *trace,
                                           struct trace_array *tr);
@@ -743,13 +770,15 @@ struct event_filter {
        int                     n_preds;
        struct filter_pred      **preds;
        char                    *filter_string;
+       bool                    no_reset;
 };
 
 struct event_subsystem {
        struct list_head        list;
        const char              *name;
        struct dentry           *entry;
-       void                    *filter;
+       struct event_filter     *filter;
+       int                     nr_events;
 };
 
 struct filter_pred;
index e75276a49cf5cfa2fb968609602621fa8e5a5da3..e0cbede9678317a9936eea9859e27f4f09dfb783 100644 (file)
@@ -17,6 +17,8 @@
 #include <linux/ctype.h>
 #include <linux/delay.h>
 
+#include <asm/setup.h>
+
 #include "trace_output.h"
 
 #define TRACE_SYSTEM "TRACE_SYSTEM"
@@ -849,8 +851,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
        /* First see if we did not already create this dir */
        list_for_each_entry(system, &event_subsystems, list) {
-               if (strcmp(system->name, name) == 0)
+               if (strcmp(system->name, name) == 0) {
+                       system->nr_events++;
                        return system->entry;
+               }
        }
 
        /* need to create new entry */
@@ -869,6 +873,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
                return d_events;
        }
 
+       system->nr_events = 1;
        system->name = kstrdup(name, GFP_KERNEL);
        if (!system->name) {
                debugfs_remove(system->entry);
@@ -987,6 +992,32 @@ struct ftrace_module_file_ops {
        struct file_operations          filter;
 };
 
+static void remove_subsystem_dir(const char *name)
+{
+       struct event_subsystem *system;
+
+       if (strcmp(name, TRACE_SYSTEM) == 0)
+               return;
+
+       list_for_each_entry(system, &event_subsystems, list) {
+               if (strcmp(system->name, name) == 0) {
+                       if (!--system->nr_events) {
+                               struct event_filter *filter = system->filter;
+
+                               debugfs_remove_recursive(system->entry);
+                               list_del(&system->list);
+                               if (filter) {
+                                       kfree(filter->filter_string);
+                                       kfree(filter);
+                               }
+                               kfree(system->name);
+                               kfree(system);
+                       }
+                       break;
+               }
+       }
+}
+
 static struct ftrace_module_file_ops *
 trace_create_file_ops(struct module *mod)
 {
@@ -1077,6 +1108,7 @@ static void trace_module_remove_events(struct module *mod)
                        list_del(&call->list);
                        trace_destroy_fields(call);
                        destroy_preds(call);
+                       remove_subsystem_dir(call->system);
                }
        }
 
@@ -1133,6 +1165,18 @@ struct notifier_block trace_module_nb = {
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
 
+static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
+
+static __init int setup_trace_event(char *str)
+{
+       strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
+       ring_buffer_expanded = 1;
+       tracing_selftest_disabled = 1;
+
+       return 1;
+}
+__setup("trace_event=", setup_trace_event);
+
 static __init int event_trace_init(void)
 {
        struct ftrace_event_call *call;
@@ -1140,6 +1184,8 @@ static __init int event_trace_init(void)
        struct dentry *entry;
        struct dentry *d_events;
        int ret;
+       char *buf = bootup_event_buf;
+       char *token;
 
        d_tracer = tracing_init_dentry();
        if (!d_tracer)
@@ -1185,6 +1231,19 @@ static __init int event_trace_init(void)
                                 &ftrace_event_format_fops);
        }
 
+       while (true) {
+               token = strsep(&buf, ",");
+
+               if (!token)
+                       break;
+               if (!*token)
+                       continue;
+
+               ret = ftrace_set_clr_event(token, 1);
+               if (ret)
+                       pr_warning("Failed to enable trace event: %s\n", token);
+       }
+
        ret = register_module_notifier(&trace_module_nb);
        if (ret)
                pr_warning("Failed to register trace events module notifier\n");
@@ -1392,10 +1451,10 @@ static __init void event_trace_self_test_with_function(void)
 
 static __init int event_trace_self_tests_init(void)
 {
-
-       event_trace_self_tests();
-
-       event_trace_self_test_with_function();
+       if (!tracing_selftest_disabled) {
+               event_trace_self_tests();
+               event_trace_self_test_with_function();
+       }
 
        return 0;
 }
index f32dc9d1ea7b51bb5c469b7778fc198079f5c499..490337abed7592b563331c4a36d440c23de2338b 100644 (file)
@@ -176,11 +176,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
 static int filter_pred_strloc(struct filter_pred *pred, void *event,
                              int val1, int val2)
 {
-       unsigned short str_loc = *(unsigned short *)(event + pred->offset);
+       u32 str_item = *(u32 *)(event + pred->offset);
+       int str_loc = str_item & 0xffff;
+       int str_len = str_item >> 16;
        char *addr = (char *)(event + str_loc);
        int cmp, match;
 
-       cmp = strncmp(addr, pred->str_val, pred->str_len);
+       cmp = strncmp(addr, pred->str_val, str_len);
 
        match = (!cmp) ^ pred->not;
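
The reworked filter_pred_strloc() above no longer trusts pred->str_len; it decodes
the field itself, because a dynamically located string is stored as a single u32
with the offset in the low 16 bits and the length in the high 16 bits. A tiny
stand-alone sketch of that encoding (the values are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t offset = 36, len = 12;                 /* assumed example values */
        uint32_t str_item = (len << 16) | offset;       /* as stored in the event record */

        printf("offset=%u len=%u\n", str_item & 0xffff, str_item >> 16);
        return 0;
}
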
 
@@ -418,24 +420,29 @@ oom:
 }
 EXPORT_SYMBOL_GPL(init_preds);
 
-static void filter_free_subsystem_preds(struct event_subsystem *system)
+enum {
+       FILTER_DISABLE_ALL,
+       FILTER_INIT_NO_RESET,
+       FILTER_SKIP_NO_RESET,
+};
+
+static void filter_free_subsystem_preds(struct event_subsystem *system,
+                                       int flag)
 {
-       struct event_filter *filter = system->filter;
        struct ftrace_event_call *call;
-       int i;
-
-       if (filter->n_preds) {
-               for (i = 0; i < filter->n_preds; i++)
-                       filter_free_pred(filter->preds[i]);
-               kfree(filter->preds);
-               filter->preds = NULL;
-               filter->n_preds = 0;
-       }
 
        list_for_each_entry(call, &ftrace_events, list) {
                if (!call->define_fields)
                        continue;
 
+               if (flag == FILTER_INIT_NO_RESET) {
+                       call->filter->no_reset = false;
+                       continue;
+               }
+
+               if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
+                       continue;
+
                if (!strcmp(call->system, system->name)) {
                        filter_disable_preds(call);
                        remove_filter_string(call->filter);
@@ -537,7 +544,8 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
 
 static int filter_add_pred(struct filter_parse_state *ps,
                           struct ftrace_event_call *call,
-                          struct filter_pred *pred)
+                          struct filter_pred *pred,
+                          bool dry_run)
 {
        struct ftrace_event_field *field;
        filter_pred_fn_t fn;
@@ -549,10 +557,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
 
        if (pred->op == OP_AND) {
                pred->pop_n = 2;
-               return filter_add_pred_fn(ps, call, pred, filter_pred_and);
+               fn = filter_pred_and;
+               goto add_pred_fn;
        } else if (pred->op == OP_OR) {
                pred->pop_n = 2;
-               return filter_add_pred_fn(ps, call, pred, filter_pred_or);
+               fn = filter_pred_or;
+               goto add_pred_fn;
        }
 
        field = find_event_field(call, pred->field_name);
@@ -575,9 +585,6 @@ static int filter_add_pred(struct filter_parse_state *ps,
                else
                        fn = filter_pred_strloc;
                pred->str_len = field->size;
-               if (pred->op == OP_NE)
-                       pred->not = 1;
-               return filter_add_pred_fn(ps, call, pred, fn);
        } else {
                if (field->is_signed)
                        ret = strict_strtoll(pred->str_val, 0, &val);
@@ -588,41 +595,33 @@ static int filter_add_pred(struct filter_parse_state *ps,
                        return -EINVAL;
                }
                pred->val = val;
-       }
 
-       fn = select_comparison_fn(pred->op, field->size, field->is_signed);
-       if (!fn) {
-               parse_error(ps, FILT_ERR_INVALID_OP, 0);
-               return -EINVAL;
+               fn = select_comparison_fn(pred->op, field->size,
+                                         field->is_signed);
+               if (!fn) {
+                       parse_error(ps, FILT_ERR_INVALID_OP, 0);
+                       return -EINVAL;
+               }
        }
 
        if (pred->op == OP_NE)
                pred->not = 1;
 
-       return filter_add_pred_fn(ps, call, pred, fn);
+add_pred_fn:
+       if (!dry_run)
+               return filter_add_pred_fn(ps, call, pred, fn);
+       return 0;
 }
 
 static int filter_add_subsystem_pred(struct filter_parse_state *ps,
                                     struct event_subsystem *system,
                                     struct filter_pred *pred,
-                                    char *filter_string)
+                                    char *filter_string,
+                                    bool dry_run)
 {
-       struct event_filter *filter = system->filter;
        struct ftrace_event_call *call;
        int err = 0;
-
-       if (!filter->preds) {
-               filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
-                                       GFP_KERNEL);
-
-               if (!filter->preds)
-                       return -ENOMEM;
-       }
-
-       if (filter->n_preds == MAX_FILTER_PRED) {
-               parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
-               return -ENOSPC;
-       }
+       bool fail = true;
 
        list_for_each_entry(call, &ftrace_events, list) {
 
@@ -632,19 +631,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
                if (strcmp(call->system, system->name))
                        continue;
 
-               err = filter_add_pred(ps, call, pred);
-               if (err) {
-                       filter_free_subsystem_preds(system);
-                       parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-                       goto out;
-               }
-               replace_filter_string(call->filter, filter_string);
+               if (call->filter->no_reset)
+                       continue;
+
+               err = filter_add_pred(ps, call, pred, dry_run);
+               if (err)
+                       call->filter->no_reset = true;
+               else
+                       fail = false;
+
+               if (!dry_run)
+                       replace_filter_string(call->filter, filter_string);
        }
 
-       filter->preds[filter->n_preds] = pred;
-       filter->n_preds++;
-out:
-       return err;
+       if (fail) {
+               parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+               return err;
+       }
+       return 0;
 }
 
 static void parse_init(struct filter_parse_state *ps,
@@ -1003,12 +1007,14 @@ static int check_preds(struct filter_parse_state *ps)
 static int replace_preds(struct event_subsystem *system,
                         struct ftrace_event_call *call,
                         struct filter_parse_state *ps,
-                        char *filter_string)
+                        char *filter_string,
+                        bool dry_run)
 {
        char *operand1 = NULL, *operand2 = NULL;
        struct filter_pred *pred;
        struct postfix_elt *elt;
        int err;
+       int n_preds = 0;
 
        err = check_preds(ps);
        if (err)
@@ -1027,24 +1033,14 @@ static int replace_preds(struct event_subsystem *system,
                        continue;
                }
 
+               if (n_preds++ == MAX_FILTER_PRED) {
+                       parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+                       return -ENOSPC;
+               }
+
                if (elt->op == OP_AND || elt->op == OP_OR) {
                        pred = create_logical_pred(elt->op);
-                       if (!pred)
-                               return -ENOMEM;
-                       if (call) {
-                               err = filter_add_pred(ps, call, pred);
-                               filter_free_pred(pred);
-                       } else {
-                               err = filter_add_subsystem_pred(ps, system,
-                                                       pred, filter_string);
-                               if (err)
-                                       filter_free_pred(pred);
-                       }
-                       if (err)
-                               return err;
-
-                       operand1 = operand2 = NULL;
-                       continue;
+                       goto add_pred;
                }
 
                if (!operand1 || !operand2) {
@@ -1053,17 +1049,15 @@ static int replace_preds(struct event_subsystem *system,
                }
 
                pred = create_pred(elt->op, operand1, operand2);
+add_pred:
                if (!pred)
                        return -ENOMEM;
-               if (call) {
-                       err = filter_add_pred(ps, call, pred);
-                       filter_free_pred(pred);
-               } else {
+               if (call)
+                       err = filter_add_pred(ps, call, pred, false);
+               else
                        err = filter_add_subsystem_pred(ps, system, pred,
-                                                       filter_string);
-                       if (err)
-                               filter_free_pred(pred);
-               }
+                                               filter_string, dry_run);
+               filter_free_pred(pred);
                if (err)
                        return err;
 
@@ -1103,7 +1097,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
                goto out;
        }
 
-       err = replace_preds(NULL, call, ps, filter_string);
+       err = replace_preds(NULL, call, ps, filter_string, false);
        if (err)
                append_filter_err(ps, call->filter);
 
@@ -1127,7 +1121,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
        mutex_lock(&event_mutex);
 
        if (!strcmp(strstrip(filter_string), "0")) {
-               filter_free_subsystem_preds(system);
+               filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
                remove_filter_string(system->filter);
                mutex_unlock(&event_mutex);
                return 0;
@@ -1138,7 +1132,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
        if (!ps)
                goto out_unlock;
 
-       filter_free_subsystem_preds(system);
        replace_filter_string(system->filter, filter_string);
 
        parse_init(ps, filter_ops, filter_string);
@@ -1148,9 +1141,23 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
                goto out;
        }
 
-       err = replace_preds(system, NULL, ps, filter_string);
-       if (err)
+       filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
+
+       /* check which events the filter can be applied to */
+       err = replace_preds(system, NULL, ps, filter_string, true);
+       if (err) {
                append_filter_err(ps, system->filter);
+               goto out;
+       }
+
+       filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
+
+       /* really apply the filter to the events */
+       err = replace_preds(system, NULL, ps, filter_string, false);
+       if (err) {
+               append_filter_err(ps, system->filter);
+               filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
+       }
 
 out:
        filter_opstack_clear(ps);
index 75ef000613c35db5c7cb8ae71b2f4169f0bfd6d8..5b01b94518fcf15cc36646b6d2b859ed64a072e6 100644 (file)
@@ -288,11 +288,9 @@ static int
 ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
                         struct ftrace_probe_ops *ops, void *data)
 {
-       char str[KSYM_SYMBOL_LEN];
        long count = (long)data;
 
-       kallsyms_lookup(ip, NULL, NULL, NULL, str);
-       seq_printf(m, "%s:", str);
+       seq_printf(m, "%pf:", (void *)ip);
 
        if (ops == &traceon_probe_ops)
                seq_printf(m, "traceon");
index 420ec34875795e42e3abc403596e4d98e8fb9c25..3f4a251b7d16ca6dc7cee3fab85ada12798d4dad 100644 (file)
@@ -52,7 +52,7 @@ static struct tracer_flags tracer_flags = {
        .opts = trace_opts
 };
 
-/* pid on the last trace processed */
+static struct trace_array *graph_array;
 
 
 /* Add a function return address to the trace stack on thread info. */
@@ -166,10 +166,121 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
        return ret;
 }
 
+static int __trace_graph_entry(struct trace_array *tr,
+                               struct ftrace_graph_ent *trace,
+                               unsigned long flags,
+                               int pc)
+{
+       struct ftrace_event_call *call = &event_funcgraph_entry;
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ent_entry *entry;
+
+       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+               return 0;
+
+       event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_ENT,
+                                         sizeof(*entry), flags, pc);
+       if (!event)
+               return 0;
+       entry   = ring_buffer_event_data(event);
+       entry->graph_ent                        = *trace;
+       if (!filter_current_check_discard(call, entry, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
+
+       return 1;
+}
+
+int trace_graph_entry(struct ftrace_graph_ent *trace)
+{
+       struct trace_array *tr = graph_array;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int ret;
+       int cpu;
+       int pc;
+
+       if (unlikely(!tr))
+               return 0;
+
+       if (!ftrace_trace_task(current))
+               return 0;
+
+       if (!ftrace_graph_addr(trace->func))
+               return 0;
+
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               ret = __trace_graph_entry(tr, trace, flags, pc);
+       } else {
+               ret = 0;
+       }
+       /* Only do the atomic if it is not already set */
+       if (!test_tsk_trace_graph(current))
+               set_tsk_trace_graph(current);
+
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+
+       return ret;
+}
+
+static void __trace_graph_return(struct trace_array *tr,
+                               struct ftrace_graph_ret *trace,
+                               unsigned long flags,
+                               int pc)
+{
+       struct ftrace_event_call *call = &event_funcgraph_exit;
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ret_entry *entry;
+
+       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+               return;
+
+       event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_RET,
+                                         sizeof(*entry), flags, pc);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       entry->ret                              = *trace;
+       if (!filter_current_check_discard(call, entry, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
+}
+
+void trace_graph_return(struct ftrace_graph_ret *trace)
+{
+       struct trace_array *tr = graph_array;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               __trace_graph_return(tr, trace, flags, pc);
+       }
+       if (!trace->depth)
+               clear_tsk_trace_graph(current);
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+}
+
 static int graph_trace_init(struct trace_array *tr)
 {
-       int ret = register_ftrace_graph(&trace_graph_return,
-                                       &trace_graph_entry);
+       int ret;
+
+       graph_array = tr;
+       ret = register_ftrace_graph(&trace_graph_return,
+                                   &trace_graph_entry);
        if (ret)
                return ret;
        tracing_start_cmdline_record();
@@ -177,49 +288,30 @@ static int graph_trace_init(struct trace_array *tr)
        return 0;
 }
 
+void set_graph_array(struct trace_array *tr)
+{
+       graph_array = tr;
+}
+
 static void graph_trace_reset(struct trace_array *tr)
 {
        tracing_stop_cmdline_record();
        unregister_ftrace_graph();
 }
 
-static inline int log10_cpu(int nb)
-{
-       if (nb / 100)
-               return 3;
-       if (nb / 10)
-               return 2;
-       return 1;
-}
+static int max_bytes_for_cpu;
 
 static enum print_line_t
 print_graph_cpu(struct trace_seq *s, int cpu)
 {
-       int i;
        int ret;
-       int log10_this = log10_cpu(cpu);
-       int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
-
 
        /*
         * Start with a space character - to make it stand out
         * to the right a bit when trace output is pasted into
         * email:
         */
-       ret = trace_seq_printf(s, " ");
-
-       /*
-        * Tricky - we space the CPU field according to the max
-        * number of online CPUs. On a 2-cpu system it would take
-        * a maximum of 1 digit - on a 128 cpu system it would
-        * take up to 3 digits:
-        */
-       for (i = 0; i < log10_all - log10_this; i++) {
-               ret = trace_seq_printf(s, " ");
-               if (!ret)
-                       return TRACE_TYPE_PARTIAL_LINE;
-       }
-       ret = trace_seq_printf(s, "%d) ", cpu);
+       ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
 
@@ -565,11 +657,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
                        return TRACE_TYPE_PARTIAL_LINE;
        }
 
-       ret = seq_print_ip_sym(s, call->func, 0);
-       if (!ret)
-               return TRACE_TYPE_PARTIAL_LINE;
-
-       ret = trace_seq_printf(s, "();\n");
+       ret = trace_seq_printf(s, "%pf();\n", (void *)call->func);
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
 
@@ -612,11 +700,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
                        return TRACE_TYPE_PARTIAL_LINE;
        }
 
-       ret = seq_print_ip_sym(s, call->func, 0);
-       if (!ret)
-               return TRACE_TYPE_PARTIAL_LINE;
-
-       ret = trace_seq_printf(s, "() {\n");
+       ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func);
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
 
@@ -934,6 +1018,8 @@ static struct tracer graph_trace __read_mostly = {
 
 static __init int init_graph_trace(void)
 {
+       max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
+
        return register_tracer(&graph_trace);
 }
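
init_graph_trace() above sizes the CPU column once with snprintf(NULL, 0, ...),
which returns the number of characters the largest CPU id would need, and
print_graph_cpu() then right-aligns every id with the "%*d" dynamic field width.
A user-space sketch of the same trick (the CPU count is assumed):

#include <stdio.h>

int main(void)
{
        int nr_cpu_ids = 128;                           /* assumed value */
        /* snprintf(NULL, 0, ...) measures the output without writing it */
        int width = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);

        for (int cpu = 0; cpu < nr_cpu_ids; cpu += 31)
                printf(" %*d) \n", width, cpu);         /* "   0) ", "  31) ", ... " 124) " */
        return 0;
}
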
 
index a98106dd979cfd7bd305cfab5a506fec61e95e46..e1285d7b5488b5e6bfb157a498338166e72c9340 100644 (file)
@@ -20,6 +20,34 @@ static int                   sched_ref;
 static DEFINE_MUTEX(sched_register_mutex);
 static int                     sched_stopped;
 
+
+void
+tracing_sched_switch_trace(struct trace_array *tr,
+                          struct task_struct *prev,
+                          struct task_struct *next,
+                          unsigned long flags, int pc)
+{
+       struct ftrace_event_call *call = &event_context_switch;
+       struct ring_buffer_event *event;
+       struct ctx_switch_entry *entry;
+
+       event = trace_buffer_lock_reserve(tr, TRACE_CTX,
+                                         sizeof(*entry), flags, pc);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       entry->prev_pid                 = prev->pid;
+       entry->prev_prio                = prev->prio;
+       entry->prev_state               = prev->state;
+       entry->next_pid                 = next->pid;
+       entry->next_prio                = next->prio;
+       entry->next_state               = next->state;
+       entry->next_cpu = task_cpu(next);
+
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               trace_buffer_unlock_commit(tr, event, flags, pc);
+}
+
 static void
 probe_sched_switch(struct rq *__rq, struct task_struct *prev,
                        struct task_struct *next)
@@ -49,6 +77,35 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
        local_irq_restore(flags);
 }
 
+void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+                          struct task_struct *wakee,
+                          struct task_struct *curr,
+                          unsigned long flags, int pc)
+{
+       struct ftrace_event_call *call = &event_wakeup;
+       struct ring_buffer_event *event;
+       struct ctx_switch_entry *entry;
+
+       event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
+                                         sizeof(*entry), flags, pc);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       entry->prev_pid                 = curr->pid;
+       entry->prev_prio                = curr->prio;
+       entry->prev_state               = curr->state;
+       entry->next_pid                 = wakee->pid;
+       entry->next_prio                = wakee->prio;
+       entry->next_state               = wakee->state;
+       entry->next_cpu                 = task_cpu(wakee);
+
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
+       ftrace_trace_stack(tr, flags, 6, pc);
+       ftrace_trace_userstack(tr, flags, pc);
+}
+
 static void
 probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
 {
index 00dd6485bdd7e7abf390a6fcbe1c27d626a8531e..d2cdbabb4eadd4b0780950b46e9e4deda92198be 100644 (file)
@@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
         * to detect and recover from possible hangs
         */
        tracing_reset_online_cpus(tr);
+       set_graph_array(tr);
        ret = register_ftrace_graph(&trace_graph_return,
                                    &trace_graph_entry_watchdog);
        if (ret) {
index 6a2a9d484cd6bb950a11b99910649e2cec4c42ff..0da1cff08d67290121f0f7d13dc49b38185964ed 100644 (file)
@@ -234,15 +234,8 @@ static void t_stop(struct seq_file *m, void *p)
 static int trace_lookup_stack(struct seq_file *m, long i)
 {
        unsigned long addr = stack_dump_trace[i];
-#ifdef CONFIG_KALLSYMS
-       char str[KSYM_SYMBOL_LEN];
 
-       sprint_symbol(str, addr);
-
-       return seq_printf(m, "%s\n", str);
-#else
-       return seq_printf(m, "%p\n", (void*)addr);
-#endif
+       return seq_printf(m, "%pF\n", (void *)addr);
 }
 
 static void print_disabled(struct seq_file *m)
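
The trace_stack.c hunk above, like the trace_functions.c one earlier, drops the
open-coded kallsyms_lookup()/sprint_symbol() calls in favour of the "%pF"/"%pf"
vsprintf extensions, which resolve a function address to its symbol with and
without the offset respectively. A kernel-style sketch only, not a stand-alone
program (the function and symbol names are illustrative):

#include <linux/kernel.h>
#include <linux/seq_file.h>

static void print_sym(struct seq_file *m, unsigned long addr)
{
        seq_printf(m, "%pF\n", (void *)addr);   /* e.g. "my_func+0x1b/0x80" */
        seq_printf(m, "%pf\n", (void *)addr);   /* e.g. "my_func" (no offset) */
}
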
index aea321c82fa098fa01d7b7fd8c7f21ea59ebbf0c..07c60b09258f0ddd68542bf4b28b41d8ae34f654 100644 (file)
@@ -49,7 +49,8 @@ static struct dentry          *stat_dir;
  * but it will at least advance closer to the next one
  * to be released.
  */
-static struct rb_node *release_next(struct rb_node *node)
+static struct rb_node *release_next(struct tracer_stat *ts,
+                                   struct rb_node *node)
 {
        struct stat_node *snode;
        struct rb_node *parent = rb_parent(node);
@@ -67,6 +68,8 @@ static struct rb_node *release_next(struct rb_node *node)
                        parent->rb_right = NULL;
 
                snode = container_of(node, struct stat_node, node);
+               if (ts->stat_release)
+                       ts->stat_release(snode->stat);
                kfree(snode);
 
                return parent;
@@ -78,7 +81,7 @@ static void __reset_stat_session(struct stat_session *session)
        struct rb_node *node = session->stat_root.rb_node;
 
        while (node)
-               node = release_next(node);
+               node = release_next(session->ts, node);
 
        session->stat_root = RB_ROOT;
 }
index f3546a2cd826bc6ae0ecb85c3c631bd411884aab..8f03914b9a6a6223622d15da4a3612f6e7e15ed3 100644 (file)
@@ -18,6 +18,8 @@ struct tracer_stat {
        int                     (*stat_cmp)(void *p1, void *p2);
        /* Print a stat entry */
        int                     (*stat_show)(struct seq_file *s, void *p);
+       /* Release an entry */
+       void                    (*stat_release)(void *stat);
        /* Print the headers of your stat entries */
        int                     (*stat_headers)(struct seq_file *s);
 };
index 97fcea4acce156228e29490345373938d92277d8..40cafb07dffd11e533b17ef5c7fcf6546aef00f5 100644 (file)
@@ -9,6 +9,7 @@
 #include <trace/events/workqueue.h>
 #include <linux/list.h>
 #include <linux/percpu.h>
+#include <linux/kref.h>
 #include "trace_stat.h"
 #include "trace.h"
 
@@ -16,6 +17,7 @@
 /* A cpu workqueue thread */
 struct cpu_workqueue_stats {
        struct list_head            list;
+       struct kref                 kref;
        int                         cpu;
        pid_t                       pid;
 /* Can be inserted from interrupt or user context, need to be atomic */
@@ -39,6 +41,11 @@ struct workqueue_global_stats {
 static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
 #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
 
+static void cpu_workqueue_stat_free(struct kref *kref)
+{
+       kfree(container_of(kref, struct cpu_workqueue_stats, kref));
+}
+
 /* Insertion of a work */
 static void
 probe_workqueue_insertion(struct task_struct *wq_thread,
@@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
                return;
        }
        INIT_LIST_HEAD(&cws->list);
+       kref_init(&cws->kref);
        cws->cpu = cpu;
-
        cws->pid = wq_thread->pid;
 
        spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
                                                        list) {
                if (node->pid == wq_thread->pid) {
                        list_del(&node->list);
-                       kfree(node);
+                       kref_put(&node->kref, cpu_workqueue_stat_free);
                        goto found;
                }
        }
@@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
 
        spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 
-       if (!list_empty(&workqueue_cpu_stat(cpu)->list))
+       if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
                ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
                                 struct cpu_workqueue_stats, list);
+               kref_get(&ret->kref);
+       }
 
        spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
@@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace)
 static void *workqueue_stat_next(void *prev, int idx)
 {
        struct cpu_workqueue_stats *prev_cws = prev;
+       struct cpu_workqueue_stats *ret;
        int cpu = prev_cws->cpu;
        unsigned long flags;
-       void *ret = NULL;
 
        spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
        if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
@@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx)
                                return NULL;
                } while (!(ret = workqueue_stat_start_cpu(cpu)));
                return ret;
+       } else {
+               ret = list_entry(prev_cws->list.next,
+                                struct cpu_workqueue_stats, list);
+               kref_get(&ret->kref);
        }
        spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
-       return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
-                         list);
+       return ret;
 }
 
 static int workqueue_stat_show(struct seq_file *s, void *p)
@@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
        return 0;
 }
 
+static void workqueue_stat_release(void *stat)
+{
+       struct cpu_workqueue_stats *node = stat;
+
+       kref_put(&node->kref, cpu_workqueue_stat_free);
+}
+
 static int workqueue_stat_headers(struct seq_file *s)
 {
        seq_printf(s, "# CPU  INSERTED  EXECUTED   NAME\n");
@@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = {
        .stat_start = workqueue_stat_start,
        .stat_next = workqueue_stat_next,
        .stat_show = workqueue_stat_show,
+       .stat_release = workqueue_stat_release,
        .stat_headers = workqueue_stat_headers
 };
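
The trace_workqueue.c changes above switch cpu_workqueue_stats over to a kref so
that the stat iterator can safely hold a node while probe_workqueue_destruction()
removes it from the list. Below is a minimal sketch of that lifetime pattern, with
illustrative names and assuming a kernel build context:

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct stat_node_example {
        struct kref     kref;
        int             cpu;
};

static void stat_node_free(struct kref *kref)
{
        kfree(container_of(kref, struct stat_node_example, kref));
}

static struct stat_node_example *stat_node_create(int cpu)
{
        struct stat_node_example *n = kzalloc(sizeof(*n), GFP_KERNEL);

        if (n) {
                kref_init(&n->kref);    /* refcount = 1, owned by the list */
                n->cpu = cpu;
        }
        return n;
}

/* The iterator takes an extra reference before dropping the list lock... */
static struct stat_node_example *stat_node_get(struct stat_node_example *n)
{
        kref_get(&n->kref);
        return n;
}

/* ...and releases it later, so a concurrent list_del() plus kref_put() by the
 * destruction probe cannot free the node while it is still being read. */
static void stat_node_put(struct stat_node_example *n)
{
        kref_put(&n->kref, stat_node_free);
}
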
 
index 911ba7ffab842dd3017229e269851501f19c145f..090d300d7394d1d890057633826ff5f6b5f03eaa 100755 (executable)
@@ -57,7 +57,6 @@
 #        call mcount  (offset: 0x5)
 #        [...]
 #        ret
-#  .globl my_func
 #  other_func:
 #        [...]
 #        call mcount (offset: 0x1b)