microsoft · yuslepukhin · Jun 23, 2022 · Jun 11, 2022 · Jun 15, 2022 · Jun 15, 2022
diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
diff --git a/include/onnxruntime/core/platform/threadpool.h b/include/onnxruntime/core/platform/threadpool.h
@@ -221,6 +221,14 @@ class ThreadPool {
                   "Per-thread state should be trivially destructible");
   };
 
+  // EnableSpinning()/DisableSpinning() turn spinning on and off.
+  // Disabling it supports real-time scenarios in which
+  // spinning between relatively infrequent requests
+  // contributes to high CPU usage while nothing is being processed.
+  void EnableSpinning();
+
+  void DisableSpinning();
+
   // Schedules fn() for execution in the pool of threads.  The function may run
   // synchronously if it cannot be enqueued.  This will occur if the thread pool's
   // degree-of-parallelism is 1, but it may also occur for implementation-dependent

diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -114,6 +114,13 @@ static const char* const kOrtSessionOptionsConfigNnapiEpPartitioningStopOps = "e
 // Available since version 1.11.
 static const char* const kOrtSessionOptionsConfigDynamicBlockBase = "session.dynamic_block_base";
 
+// This option reduces CPU usage between infrequent requests by forcing
+// thread-pool threads to stop spinning immediately when the last of the
+// concurrent Run() calls returns.
+// Spinning is restarted on the next Run() call.
+// "1" enables it; the default is "0". Applies only to internal thread-pools.
+static const char* const kOrtSessionOptionsConfigForceSpinningStop = "session.force_spinning_stop_between_runs";
+
 // "1": all inconsistencies encountered during shape and type inference
 // will result in failures.
 // "0": in some cases warnings will be logged but processing will continue. The default.

diff --git a/onnxruntime/core/common/threadpool.cc b/onnxruntime/core/common/threadpool.cc
@@ -650,6 +650,18 @@ std::string ThreadPool::StopProfiling(concurrency::ThreadPool* tp) {
   }
 }
 
+// Forwards to extended_eigen_threadpool_ when it is set; otherwise does nothing.
+void ThreadPool::EnableSpinning() {
+  if (!extended_eigen_threadpool_) return;
+  extended_eigen_threadpool_->EnableSpinning();
+}
+
+// Forwards to extended_eigen_threadpool_ when it is set; otherwise does nothing.
+void ThreadPool::DisableSpinning() {
+  if (!extended_eigen_threadpool_) return;
+  extended_eigen_threadpool_->DisableSpinning();
+}
+
 // Return the number of threads created by the pool.
 int ThreadPool::NumThreads() const {
   if (underlying_threadpool_) {

diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
@@ -594,12 +594,6 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
   }
 
   {
-    // Enter a parallel section encompassing the kernels invoked
-    // below.  This lets the runtime system amortize loop entry/exit
-    // costs over a series of short kernels, and promotes cache
-    // affinity between iterations of successive loops.
-    onnxruntime::concurrency::ThreadPool::ParallelSection ps(ttp_);
-
     // for each item in sequence run all calculations
     for (int step = 0; step < max_sequence_length; step++) {
 #if defined(DUMP_MATRIXES)

diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
@@ -264,6 +264,7 @@ void InferenceSession::ConstructorCommon(const SessionOptions& session_options,
   }
 
   use_per_session_threads_ = session_options.use_per_session_threads;
+  force_spinning_stop_between_runs_ = session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsConfigForceSpinningStop, "0") == "1";
 
   if (use_per_session_threads_) {
     LOGS(*session_logger_, INFO) << "Creating and using per session threadpools since use_per_session_threads_ is true";
@@ -1930,6 +1931,44 @@ Status InferenceSession::Run(const RunOptions& run_options,
 #ifdef DEBUG_NODE_INPUTS_OUTPUTS
       session_state_->IncrementGraphExecutionCounter();
 #endif
+
+      // RAII guard for one Run(): the first concurrent Run() enables spinning on both
+      // pools and the last one to finish disables it. NOTE(review): a Run() that starts
+      // while the last one is exiting can end up with spinning disabled -- confirm OK.
+      struct ThreadPoolSpinningSwitch {
+        using PS = onnxruntime::concurrency::ThreadPool::ParallelSection;
+        concurrency::ThreadPool* intra_tp_;
+        concurrency::ThreadPool* inter_tp_;
+        std::atomic_int32_t& counter_ref_;  // number of guards currently alive
+        // Held for the guard's lifetime; precludes an explicit PS in the nodes or a PS on inter_tp_.
+        std::optional<PS> ps_intra_;
+        ThreadPoolSpinningSwitch(concurrency::ThreadPool* intra_tp, concurrency::ThreadPool* inter_tp,
+                                 std::atomic_int32_t& ref) noexcept
+            : intra_tp_(intra_tp), inter_tp_(inter_tp), counter_ref_(ref) {
+          if (counter_ref_.fetch_add(1, std::memory_order_relaxed) == 0) {
+            if (intra_tp_) intra_tp_->EnableSpinning();
+            if (inter_tp_) inter_tp_->EnableSpinning();
+          }
+          if (intra_tp_) ps_intra_.emplace(intra_tp_);
+        }
+        // Non-copyable: a copy would decrement the counter a second time on destruction.
+        ThreadPoolSpinningSwitch(const ThreadPoolSpinningSwitch&) = delete;
+        ThreadPoolSpinningSwitch& operator=(const ThreadPoolSpinningSwitch&) = delete;
+        ~ThreadPoolSpinningSwitch() {
+          ps_intra_.reset();  // end the parallel section before spinning may be disabled
+          if (1 == counter_ref_.fetch_sub(1, std::memory_order_acq_rel)) {
+            if (intra_tp_) intra_tp_->DisableSpinning();
+            if (inter_tp_) inter_tp_->DisableSpinning();
+          }
+        }
+      };
+
+      std::optional<ThreadPoolSpinningSwitch> tp_starter;  // engaged only when the option is set
+      if (force_spinning_stop_between_runs_) {
+        concurrency::ThreadPool* const intra_tp = use_per_session_threads_ ? thread_pool_.get() : intra_op_thread_pool_from_env_;
+        concurrency::ThreadPool* const inter_tp = use_per_session_threads_ ? inter_op_thread_pool_.get() : inter_op_thread_pool_from_env_;
+        tp_starter.emplace(intra_tp, inter_tp, invocation_refcounter_);
+      }
       ORT_CHECK_AND_SET_RETVAL(utils::ExecuteGraph(*session_state_, feeds_fetches_manager, feeds, *p_fetches,
                                                    session_options_.execution_mode, run_options.terminate, run_logger,
                                                    run_options.only_execute_path_to_fetches));

diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h
@@ -664,6 +664,17 @@ class InferenceSession {
   std::basic_string<ORTCHAR_T> thread_pool_name_;
   std::basic_string<ORTCHAR_T> inter_thread_pool_name_;
 
+  // When force_spinning_stop_between_runs_ is set, this counter is incremented as Run()
+  // starts executing the graph and decremented when it finishes; with concurrent Run() calls
+  // it is greater than 1. Run() uses it to detect whether another call is in progress
+  // so that it can adjust the thread-pools' spinning policies.
+  std::atomic_int32_t invocation_refcounter_{0};
+  // This option reduces CPU usage between infrequent requests by forcing
+  // thread-pool threads to stop spinning immediately when the last of the
+  // concurrent Run() calls returns.
+  // Spinning is restarted on the next Run().
+  bool force_spinning_stop_between_runs_ = false;
+
   std::unique_ptr<onnxruntime::concurrency::ThreadPool> thread_pool_;
   std::unique_ptr<onnxruntime::concurrency::ThreadPool> inter_op_thread_pool_;