/* Copyright 2020 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef RUY_RUY_PROFILER_INSTRUMENTATION_H_
#define RUY_RUY_PROFILER_INSTRUMENTATION_H_

#ifdef RUY_PROFILER
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <string>
#include <vector>
#endif

namespace ruy {
namespace profiler {

#ifdef RUY_PROFILER

// A label is how a code scope is annotated to appear in profiles.
// The stacks that are sampled by the profiler are stacks of such labels.
// A label consists of a literal string, plus optional integer arguments.
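//
// For illustration only (the format text and the shape variables below are
// hypothetical, not part of this API), a label could be set up as:
//
//   Label label("GEMM shape %dx%dx%d", rows, depth, cols);
//   // With rows=64, depth=128, cols=8, label.Formatted() would be expected
//   // to yield "GEMM shape 64x128x8".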
class Label {
 public:
  Label() {}
  template <typename... Args>
  explicit Label(Args... args) {
    Set(args...);
  }
  void Set(const char* format) {
    format_ = format;
    args_count_ = 0;
  }
  template <typename... Args>
  void Set(const char* format, Args... args) {
    format_ = format;
    args_count_ = sizeof...(args);
    SetArgs(0, args...);
  }
  void operator=(const Label& other);
  bool operator==(const Label& other) const;

  std::string Formatted() const;
  const char* format() const { return format_; }

 private:
  void SetArgs(int position, int arg0) { args_[position] = arg0; }

  template <typename... Args>
  void SetArgs(int position, int arg0, Args... args) {
    SetArgs(position, arg0);
    SetArgs(position + 1, args...);
  }

  static constexpr int kMaxArgs = 4;
  const char* format_ = nullptr;
  int args_count_ = 0;
  int args_[kMaxArgs];
};

namespace detail {

// Forward-declaration, see class ThreadStack below.
class ThreadStack;

// Returns a reference to the global flag tracking whether a profiling
// session is currently running.
bool& GlobalIsProfilerRunning();

// Returns the global vector of pointers to all stacks; there is one stack
// per thread executing instrumented code.
std::vector<ThreadStack*>* GlobalAllThreadStacks();

// Returns the mutex to be locked around any access to GlobalAllThreadStacks().
std::mutex* GlobalsMutex();

// Returns the thread-local stack, specific to the current thread.
ThreadStack* ThreadLocalThreadStack();

// This 'stack' is what may be more appropriately called a 'pseudo-stack':
// it contains Label entries that are 'manually' pushed and popped by
// instrumentation code. It is unrelated to the real machine call stack.
struct Stack {
  std::uint32_t id = 0;
  static constexpr int kMaxSize = 64;
  int size = 0;
  Label labels[kMaxSize];
};

// Returns the buffer byte size required by CopyToBuffer.
int GetBufferSize(const Stack& stack);

// Copies the given Stack into a byte buffer, called a 'sample'.
void CopyToBuffer(const Stack& stack, char* dst);

// Populates the given Stack from an existing sample buffer, typically
// produced by CopyToBuffer.
void ReadFromBuffer(const char* src, Stack* stack);
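
// As a minimal sketch of how these three functions fit together (the buffer
// handling and variable names below are illustrative, not part of this API),
// a Stack can be serialized into a 'sample' and read back as follows:
//
//   std::vector<char> sample(GetBufferSize(stack));
//   CopyToBuffer(stack, sample.data());
//   ...
//   Stack restored;
//   ReadFromBuffer(sample.data(), &restored);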

// ThreadStack is meant to be used as a thread-local singleton, assigning to
// each thread a Stack object holding its pseudo-stack of profile labels,
// plus a mutex used to synchronize accesses to this pseudo-stack between
// this thread and a possible profiler thread sampling it.
class ThreadStack {
 public:
  ThreadStack();
  ~ThreadStack();

  const Stack& stack() const { return stack_; }

  // Returns the mutex to lock around any access to this stack. Each stack is
  // accessed by potentially two threads: the thread that it belongs to
  // (which calls Push and Pop) and the profiler thread during profiling
  // (which reads the stack to take a sample, see CopyToBuffer).
  std::mutex& Mutex() const { return mutex_; }

  // Pushes a new label on the top of this Stack.
  template <typename... Args>
  void Push(Args... args) {
    // This mutex locking is needed to guard against race conditions as both
    // the current thread and the profiler thread may be concurrently accessing
    // this stack. In addition to that, this mutex locking also serves the
    // other purpose of acting as a barrier (of compiler code reordering, of
    // runtime CPU instruction reordering, and of memory access reordering),
    // which gives a measure of correctness to this profiler. The downside is
    // some latency. As this lock will be uncontended most of the time, the
    // cost should be roughly that of a sequentially-consistent atomic access,
    // comparable to an access to the level of CPU data cache that is shared
    // among all cores, typically 60 cycles on current ARM CPUs, plus side
    // effects from barrier instructions.
    std::lock_guard<std::mutex> lock(mutex_);
    // Avoid overrunning the stack, even in 'release' builds. This profiling
    // instrumentation code should not ship in release builds anyway, the
    // overhead of this check is negligible, and overrunning the stack array
    // would be bad.
    if (stack_.size >= Stack::kMaxSize) {
      abort();
    }
    stack_.labels[stack_.size++].Set(args...);
  }

  // Pops the top-most label from this Stack.
  void Pop() {
    // See the comment in Push about this lock. While it would be tempting to
    // try to remove this lock and just atomically decrement stack_.size with a
    // store-release, that would not necessarily be a substitute for all of the
    // purposes that this lock serves; and even if it were done carefully
    // enough to serve all of the same purposes, it wouldn't be faster than
    // this (mostly uncontended) lock.
    std::lock_guard<std::mutex> lock(mutex_);
    stack_.size--;
  }

 private:
  mutable std::mutex mutex_;
  Stack stack_;
};
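
// As a rough sketch of how a profiler thread could take one sample of all
// thread stacks (the loop and local buffer below are assumptions for
// illustration, not the actual profiler implementation): it locks the global
// registry, then each per-thread mutex, and serializes each stack:
//
//   std::lock_guard<std::mutex> globals_lock(*GlobalsMutex());
//   for (ThreadStack* thread_stack : *GlobalAllThreadStacks()) {
//     std::lock_guard<std::mutex> stack_lock(thread_stack->Mutex());
//     std::vector<char> sample(GetBufferSize(thread_stack->stack()));
//     CopyToBuffer(thread_stack->stack(), sample.data());
//   }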

}  // namespace detail

// ScopeLabel is the user-facing RAII way to construct a Label tied to a code
// scope: the label is pushed onto the current thread's stack when the scope
// is entered and popped when it is exited.
class ScopeLabel {
 public:
  template <typename... Args>
  ScopeLabel(Args... args) : thread_stack_(detail::ThreadLocalThreadStack()) {
    thread_stack_->Push(args...);
  }

  ~ScopeLabel() { thread_stack_->Pop(); }

 private:
  detail::ThreadStack* thread_stack_;
};
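
// As a usage sketch (the function name, label text and shape variables below
// are hypothetical, purely for illustration), instrumenting a scope looks
// like:
//
//   void PackLhs(int rows, int cols) {
//     ruy::profiler::ScopeLabel label("PackLhs %dx%d", rows, cols);
//     ...  // work attributed to this label in sampled profiles
//   }  // the label is popped from the thread's stack here
//
// ScopeLabels in nested scopes and callees stack up, which is what produces
// the label stacks that the profiler samples.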

#else  // no RUY_PROFILER

class ScopeLabel {
 public:
  template <typename... Args>
  explicit ScopeLabel(Args...) {}

  // This destructor is needed to consistently silence clang's
  // -Wunused-variable, which otherwise seems to trigger semi-randomly.
  ~ScopeLabel() {}
};

#endif  // RUY_PROFILER

}  // namespace profiler
}  // namespace ruy

#endif  // RUY_RUY_PROFILER_INSTRUMENTATION_H_