[ML] Add total ML memory to ML info (#65214)

This change adds an extra piece of information, limits.total_ml_memory, to the ML info response. The new field reports the total amount of memory that ML is permitted to use for native processes across all ML nodes in the cluster. Some of this memory may already be in use; the value returned is the total, not the currently available, ML memory. Backport of #65195.
elastic · Nov 18, 2020 · d8f549c · d8f549c
1 parent 7df9873
commit d8f549c
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 11 deletions.
diff --git a/docs/reference/ml/anomaly-detection/apis/get-ml-info.asciidoc b/docs/reference/ml/anomaly-detection/apis/get-ml-info.asciidoc
@@ -30,7 +30,8 @@ privileges. See <<security-privileges>>, <<built-in-roles>> and
 This endpoint is designed to be used by a user interface that needs to fully
 understand machine learning configurations where some options are not specified,
 meaning that the defaults should be used.  This endpoint may be used to find out
-what those defaults are.
+what those defaults are.  It also provides information about the maximum size
+of {ml} jobs that could run in the current cluster configuration.
 
 [[get-ml-info-example]]
 == {api-examples-title}
@@ -115,11 +116,13 @@ This is a possible response:
     "build_hash": "99a07c016d5a73"
   },
   "limits" : {
-    "effective_max_model_memory_limit": "28961mb"
+    "effective_max_model_memory_limit": "28961mb",
+    "total_ml_memory": "86883mb"
   }
 }
 ----
 // TESTRESPONSE[s/"upgrade_mode": false/"upgrade_mode": $body.upgrade_mode/]
 // TESTRESPONSE[s/"version": "7.0.0",/"version": "$body.native_code.version",/]
 // TESTRESPONSE[s/"build_hash": "99a07c016d5a73"/"build_hash": "$body.native_code.build_hash"/]
 // TESTRESPONSE[s/"effective_max_model_memory_limit": "28961mb"/"effective_max_model_memory_limit": "$body.limits.effective_max_model_memory_limit"/]
+// TESTRESPONSE[s/"total_ml_memory": "86883mb"/"total_ml_memory": "$body.limits.total_ml_memory"/]
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportMlInfoAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportMlInfoAction.java
@@ -15,6 +15,7 @@
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.settings.ClusterSettings;
+import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.env.Environment;
@@ -128,6 +129,23 @@ private Map<String, Object> datafeedsDefaults() {
         return anomalyDetectorsDefaults;
     }
 
+    static ByteSizeValue calculateTotalMlMemory(ClusterSettings clusterSettings, DiscoveryNodes nodes) {
+
+        long totalMlMemory = 0;
+
+        for (DiscoveryNode node : nodes) {
+            OptionalLong limit = NativeMemoryCalculator.allowedBytesForMl(node, clusterSettings);
+            if (limit.isPresent() == false) {
+                continue;
+            }
+            totalMlMemory += limit.getAsLong();
+        }
+
+        // Round down to a whole number of megabytes, since we generally deal with model
+        // memory limits in whole megabytes
+        return ByteSizeValue.ofMb(ByteSizeUnit.BYTES.toMB(totalMlMemory));
+    }
+
     static ByteSizeValue calculateEffectiveMaxModelMemoryLimit(ClusterSettings clusterSettings, DiscoveryNodes nodes) {
 
         long maxMlMemory = -1;
@@ -148,7 +166,7 @@ static ByteSizeValue calculateEffectiveMaxModelMemoryLimit(ClusterSettings clust
 
         maxMlMemory -= Math.max(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), DataFrameAnalyticsConfig.PROCESS_MEMORY_OVERHEAD.getBytes());
         maxMlMemory -= MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes();
-        return ByteSizeValue.ofMb(Math.max(0L, maxMlMemory) / 1024 / 1024);
+        return ByteSizeValue.ofMb(ByteSizeUnit.BYTES.toMB(Math.max(0L, maxMlMemory)));
     }
 
     private Map<String, Object> limits() {
@@ -166,6 +184,8 @@ private Map<String, Object> limits() {
         if (effectiveMaxModelMemoryLimit != null) {
             limits.put("effective_max_model_memory_limit", effectiveMaxModelMemoryLimit.getStringRep());
         }
+        limits.put("total_ml_memory",
+            calculateTotalMlMemory(clusterService.getClusterSettings(), clusterService.state().getNodes()).getStringRep());
         return limits;
     }
 }
diff --git a/...plugin/ml/src/test/java/org/elasticsearch/xpack/ml/action/TransportMlInfoActionTests.java b/...plugin/ml/src/test/java/org/elasticsearch/xpack/ml/action/TransportMlInfoActionTests.java
@@ -24,6 +24,7 @@
 
 import static org.elasticsearch.xpack.ml.MachineLearning.MAX_MACHINE_MEMORY_PERCENT;
 import static org.elasticsearch.xpack.ml.MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT;
+import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.hamcrest.Matchers.notNullValue;
 import static org.hamcrest.Matchers.nullValue;
@@ -36,7 +37,8 @@ public void testCalculateEffectiveMaxModelMemoryLimit() {
         ClusterSettings clusterSettings = new ClusterSettings(
             Settings.builder().put(MAX_MACHINE_MEMORY_PERCENT.getKey(), mlMemoryPercent).build(),
             Sets.newHashSet(MAX_MACHINE_MEMORY_PERCENT, USE_AUTO_MACHINE_MEMORY_PERCENT));
-        long highestMlMachineMemory = -1;
+        long highestMlMachineMemoryBytes = -1;
+        long totalMlMemoryBytes = 0;
 
         DiscoveryNodes.Builder builder = DiscoveryNodes.builder();
         for (int i = randomIntBetween(1, 10); i > 0; --i) {
@@ -49,7 +51,8 @@ public void testCalculateEffectiveMaxModelMemoryLimit() {
             } else {
                 // ML node
                 long machineMemory = randomLongBetween(2000000000L, 100000000000L);
-                highestMlMachineMemory = Math.max(machineMemory, highestMlMachineMemory);
+                highestMlMachineMemoryBytes = Math.max(machineMemory, highestMlMachineMemoryBytes);
+                totalMlMemoryBytes += machineMemory * mlMemoryPercent / 100;
                 builder.add(new DiscoveryNode(nodeName, nodeId, ta,
                     Collections.singletonMap(MachineLearning.MACHINE_MEMORY_NODE_ATTR, String.valueOf(machineMemory)),
                     Collections.emptySet(), Version.CURRENT));
@@ -59,14 +62,19 @@ public void testCalculateEffectiveMaxModelMemoryLimit() {
 
         ByteSizeValue effectiveMaxModelMemoryLimit = TransportMlInfoAction.calculateEffectiveMaxModelMemoryLimit(clusterSettings, nodes);
 
-        if (highestMlMachineMemory < 0) {
+        if (highestMlMachineMemoryBytes < 0) {
             assertThat(effectiveMaxModelMemoryLimit, nullValue());
         } else {
             assertThat(effectiveMaxModelMemoryLimit, notNullValue());
             assertThat(effectiveMaxModelMemoryLimit.getBytes()
                     + Math.max(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), DataFrameAnalyticsConfig.PROCESS_MEMORY_OVERHEAD.getBytes())
                     + MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes(),
-                lessThanOrEqualTo(highestMlMachineMemory * mlMemoryPercent / 100));
+                lessThanOrEqualTo(highestMlMachineMemoryBytes * mlMemoryPercent / 100));
         }
+
+        ByteSizeValue totalMlMemory = TransportMlInfoAction.calculateTotalMlMemory(clusterSettings, nodes);
+
+        assertThat(totalMlMemory, notNullValue());
+        assertThat(totalMlMemory, is(ByteSizeValue.ofMb(totalMlMemoryBytes / (1024 * 1024))));
     }
 }
diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/ml_info.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/ml_info.yml
@@ -17,8 +17,9 @@ teardown:
   - match: { defaults.anomaly_detectors.daily_model_snapshot_retention_after_days: 1 }
   - match: { defaults.datafeeds.scroll_size: 1000 }
   - is_false: limits.max_model_memory_limit
-  # We cannot assert an exact value for the next one as it will vary depending on the test machine
+  # We cannot assert an exact value for the next two as they will vary depending on the test machine
   - match: { limits.effective_max_model_memory_limit: "/\\d+[kmg]?b/" }
+  - match: { limits.total_ml_memory: "/\\d+mb/" }
   - match: { upgrade_mode: false }
 
   - do:
@@ -36,8 +37,9 @@ teardown:
   - match: { defaults.anomaly_detectors.daily_model_snapshot_retention_after_days: 1 }
   - match: { defaults.datafeeds.scroll_size: 1000 }
   - match: { limits.max_model_memory_limit: "512mb" }
-  # We cannot assert an exact value for the next one as it will vary depending on the test machine
+  # We cannot assert an exact value for the next two as they will vary depending on the test machine
   - match: { limits.effective_max_model_memory_limit: "/\\d+[kmg]?b/" }
+  - match: { limits.total_ml_memory: "/\\d+mb/" }
   - match: { upgrade_mode: false }
 
   - do:
@@ -55,8 +57,9 @@ teardown:
   - match: { defaults.anomaly_detectors.daily_model_snapshot_retention_after_days: 1 }
   - match: { defaults.datafeeds.scroll_size: 1000 }
   - match: { limits.max_model_memory_limit: "6gb" }
-  # We cannot assert an exact value for the next one as it will vary depending on the test machine
+  # We cannot assert an exact value for the next two as they will vary depending on the test machine
   - match: { limits.effective_max_model_memory_limit: "/\\d+[kmg]?b/" }
+  - match: { limits.total_ml_memory: "/\\d+mb/" }
   - match: { upgrade_mode: false }
 
   - do:
@@ -74,8 +77,9 @@ teardown:
   - match: { defaults.anomaly_detectors.daily_model_snapshot_retention_after_days: 1 }
   - match: { defaults.datafeeds.scroll_size: 1000 }
   - match: { limits.max_model_memory_limit: "6gb" }
-  # We cannot assert an exact value for the next one as it will vary depending on the test machine
+  # We cannot assert an exact value for the next two as they will vary depending on the test machine
   - match: { limits.effective_max_model_memory_limit: "/\\d+[kmg]?b/" }
+  - match: { limits.total_ml_memory: "/\\d+mb/" }
   - match: { upgrade_mode: false }
 
   - do:
@@ -95,4 +99,5 @@ teardown:
   - match: { limits.max_model_memory_limit: "1mb" }
   # This time we can assert an exact value for the next one because the hard limit is so low
   - match: { limits.effective_max_model_memory_limit: "1mb" }
+  - match: { limits.total_ml_memory: "/\\d+mb/" }
   - match: { upgrade_mode: false }