// instrumentation.h (6.5 KB)
  1. /* Copyright 2020 Google LLC. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #ifndef RUY_RUY_PROFILER_INSTRUMENTATION_H_
  13. #define RUY_RUY_PROFILER_INSTRUMENTATION_H_
  #ifdef RUY_PROFILER
  #include <cstdint>
  #include <cstdio>
  #include <cstdlib>
  #include <mutex>
  #include <string>
  #include <vector>
  #endif
  19. namespace ruy {
  20. namespace profiler {
  21. #ifdef RUY_PROFILER
  // A label is how a code scope is annotated to appear in profiles.
  // The stacks that are sampled by the profiler are stacks of such labels.
  // A label consists of a literal string, plus optional integer arguments
  // (at most kMaxArgs of them; exceeding that is a compile-time error).
  //
  // Only the pointer to the format string is stored, the characters are not
  // copied, so the string must stay alive for as long as the label may be read.
  class Label {
   public:
    // Constructs an empty label: null format string and no arguments.
    Label() {}

    // Constructs a label from a format string followed by its integer
    // arguments. Simply forwards everything to Set().
    template <typename... Args>
    explicit Label(Args... args) {
      Set(args...);
    }

    // Sets this label to a format string that takes no arguments.
    void Set(const char* format) {
      format_ = format;
      args_count_ = 0;
    }

    // Sets this label to a format string plus one or more integer arguments.
    template <typename... Args>
    void Set(const char* format, Args... args) {
      // SetArgs writes args_[0 .. sizeof...(Args)) unconditionally, so reject
      // over-long argument lists here instead of overrunning the array.
      static_assert(static_cast<int>(sizeof...(Args)) <= kMaxArgs,
                    "A Label accepts at most kMaxArgs integer arguments.");
      format_ = format;
      args_count_ = static_cast<int>(sizeof...(Args));
      SetArgs(0, args...);
    }

    // Copy-assignment, defined out of line. Note that it returns void, so
    // assignments cannot be chained.
    void operator=(const Label& other);

    // Equality comparison, defined out of line.
    bool operator==(const Label& other) const;

    // Returns this label rendered as a std::string. NOTE(review): presumably
    // the format string expanded with the stored arguments -- confirm against
    // the out-of-line definition.
    std::string Formatted() const;

    // Returns the stored format string pointer (nullptr for an empty label).
    const char* format() const { return format_; }

   private:
    // Stores a single argument at the given position. Base case of the
    // recursion below.
    void SetArgs(int position, int arg0) { args_[position] = arg0; }

    // Stores arg0 at `position`, then the remaining arguments at the
    // following positions.
    template <typename... Args>
    void SetArgs(int position, int arg0, Args... args) {
      SetArgs(position, arg0);
      SetArgs(position + 1, args...);
    }

    // Capacity of args_.
    static constexpr int kMaxArgs = 4;

    const char* format_ = nullptr;
    // Number of leading entries of args_ that are in use.
    int args_count_ = 0;
    // Value-initialized so that a default-constructed Label never carries
    // indeterminate values.
    int args_[kMaxArgs] = {};
  };
  58. namespace detail {
  // Forward-declaration, see class ThreadStack below.
  class ThreadStack;
  // Returns a mutable reference to a global boolean.
  // NOTE(review): judging by the name, this flag tells whether the profiler is
  // currently running -- confirm against the definition, which is not in this
  // header.
  bool& GlobalIsProfilerRunning();
  // Returns the global vector of pointers to all stacks, there being one stack
  // per thread executing instrumented code.
  std::vector<ThreadStack*>* GlobalAllThreadStacks();
  // Returns the mutex to be locked around any access to GlobalAllThreadStacks().
  std::mutex* GlobalsMutex();
  // Returns the thread-local stack, specific to the current thread.
  ThreadStack* ThreadLocalThreadStack();
  69. // This 'stack' is what may be more appropriately called a 'pseudostack':
  70. // It contains Label entries that are 'manually' entered by instrumentation
  71. // code. It's unrelated to real call stacks.
  72. struct Stack {
  73. std::uint32_t id = 0;
  74. static constexpr int kMaxSize = 64;
  75. int size = 0;
  76. Label labels[kMaxSize];
  77. };
  // Returns the buffer byte size required by CopyToBuffer for the given Stack.
  int GetBufferSize(const Stack& stack);
  // Copies the given Stack into a byte buffer, called a 'sample'.
  // NOTE(review): presumably dst must have room for at least
  // GetBufferSize(stack) bytes -- confirm against the definition.
  void CopyToBuffer(const Stack& stack, char* dst);
  // Populates the given Stack from an existing sample buffer, typically
  // produced by CopyToBuffer.
  void ReadFromBuffer(const char* src, Stack* stack);
  85. // ThreadStack is meant to be used as a thread-local singleton, assigning to
  86. // each thread a Stack object holding its pseudo-stack of profile labels,
  87. // plus a mutex allowing to synchronize accesses to this pseudo-stack between
  88. // this thread and a possible profiler thread sampling it.
  89. class ThreadStack {
  90. public:
  91. ThreadStack();
  92. ~ThreadStack();
  93. const Stack& stack() const { return stack_; }
  94. // Returns the mutex to lock around any access to this stack. Each stack is
  95. // accessed by potentially two threads: the thread that it belongs to
  96. // (which calls Push and Pop) and the profiler thread during profiling
  97. // (which calls CopyToSample).
  98. std::mutex& Mutex() const { return mutex_; }
  99. // Pushes a new label on the top of this Stack.
  100. template <typename... Args>
  101. void Push(Args... args) {
  102. // This mutex locking is needed to guard against race conditions as both
  103. // the current thread and the profiler thread may be concurrently accessing
  104. // this stack. In addition to that, this mutex locking also serves the other
  105. // purpose of acting as a barrier (of compiler code reordering, of runtime
  106. // CPU instruction reordering, and of memory access reordering), which
  107. // gives a measure of correctness to this profiler. The downside is some
  108. // latency. As this lock will be uncontended most of the times, the cost
  109. // should be roughly that of an sequentially-consistent atomic access,
  110. // comparable to an access to the level of CPU data cache that is shared
  111. // among all cores, typically 60 cycles on current ARM CPUs, plus side
  112. // effects from barrier instructions.
  113. std::lock_guard<std::mutex> lock(mutex_);
  114. // Avoid overrunning the stack, even in 'release' builds. This profiling
  115. // instrumentation code should not ship in release builds anyway, the
  116. // overhead of this check is negligible, and overrunning a stack array would
  117. // be bad.
  118. if (stack_.size >= Stack::kMaxSize) {
  119. abort();
  120. }
  121. stack_.labels[stack_.size++].Set(args...);
  122. }
  123. // Pops the top-most label from this Stack.
  124. void Pop() {
  125. // See the comment in Push about this lock. While it would be tempting to
  126. // try to remove this lock and just atomically decrement size_ with a
  127. // store-release, that would not necessarily be a substitute for all of the
  128. // purposes that this lock serves, or if it was done carefully to serve all
  129. // of the same purposes, then that wouldn't be faster than this (mostly
  130. // uncontended) lock.
  131. std::lock_guard<std::mutex> lock(mutex_);
  132. stack_.size--;
  133. }
  134. private:
  135. mutable std::mutex mutex_;
  136. Stack stack_;
  137. };
  138. } // namespace detail
  139. // RAII user-facing way to construct Labels associated with their life scope
  140. // and get them pushed to / popped from the current thread stack.
  141. class ScopeLabel {
  142. public:
  143. template <typename... Args>
  144. ScopeLabel(Args... args) : thread_stack_(detail::ThreadLocalThreadStack()) {
  145. thread_stack_->Push(args...);
  146. }
  147. ~ScopeLabel() { thread_stack_->Pop(); }
  148. private:
  149. detail::ThreadStack* thread_stack_;
  150. };
  151. #else // no RUY_PROFILER
  // Stand-in for ScopeLabel when RUY_PROFILER is not defined: it accepts the
  // same constructor arguments and does nothing with them.
  class ScopeLabel {
   public:
    // Deliberately user-provided and empty rather than implicit: having it
    // consistently silences clang's -Wunused-variable, which otherwise seems
    // to trigger semi-randomly.
    ~ScopeLabel() {}

    // Accepts any arguments and discards them.
    template <typename... Unused>
    explicit ScopeLabel(Unused...) {}
  };
  160. #endif
  161. } // namespace profiler
  162. } // namespace ruy
  163. #endif // RUY_RUY_PROFILER_INSTRUMENTATION_H_