Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] make xpack.ml.max_ml_node_size and xpack.ml.use_auto_machine_memory_percent dynamically settable #66132

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,7 @@ public Set<DiscoveryNodeRole> getRoles() {
public static final Setting<Boolean> USE_AUTO_MACHINE_MEMORY_PERCENT = Setting.boolSetting(
"xpack.ml.use_auto_machine_memory_percent",
false,
Property.Dynamic,
Property.NodeScope);
public static final Setting<Integer> MAX_LAZY_ML_NODES =
Setting.intSetting("xpack.ml.max_lazy_ml_nodes", 0, 0, Property.Dynamic, Property.NodeScope);
Expand Down Expand Up @@ -497,6 +498,7 @@ public Set<DiscoveryNodeRole> getRoles() {
public static final Setting<ByteSizeValue> MAX_ML_NODE_SIZE = Setting.byteSizeSetting(
"xpack.ml.max_ml_node_size",
ByteSizeValue.ZERO,
Property.Dynamic,
Property.NodeScope);

private static final Logger logger = LogManager.getLogger(MachineLearning.class);
Expand Down Expand Up @@ -579,7 +581,11 @@ public Settings additionalSettings() {
// This is not used in v7 and higher, but users are still prevented from setting it directly to avoid confusion
disallowMlNodeAttributes(mlEnabledNodeAttrName);
} else {
disallowMlNodeAttributes(mlEnabledNodeAttrName, maxOpenJobsPerNodeNodeAttrName, machineMemoryAttrName);
disallowMlNodeAttributes(mlEnabledNodeAttrName,
maxOpenJobsPerNodeNodeAttrName,
machineMemoryAttrName,
jvmSizeAttrName
);
}
return additionalSettings.build();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ public class MlAutoscalingDeciderService implements AutoscalingDeciderService,
private final NodeLoadDetector nodeLoadDetector;
private final MlMemoryTracker mlMemoryTracker;
private final Supplier<Long> timeSupplier;
private final boolean useAuto;

private volatile boolean isMaster;
private volatile boolean running;
private volatile int maxMachineMemoryPercent;
private volatile int maxOpenJobs;
private volatile boolean useAuto;
private volatile long lastTimeToScale;
private volatile long scaleDownDetected;

Expand All @@ -99,6 +99,7 @@ public MlAutoscalingDeciderService(MlMemoryTracker memoryTracker, Settings setti
clusterService.getClusterSettings().addSettingsUpdateConsumer(MachineLearning.MAX_MACHINE_MEMORY_PERCENT,
this::setMaxMachineMemoryPercent);
clusterService.getClusterSettings().addSettingsUpdateConsumer(MachineLearning.MAX_OPEN_JOBS_PER_NODE, this::setMaxOpenJobs);
clusterService.getClusterSettings().addSettingsUpdateConsumer(MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT, this::setUseAuto);
clusterService.addLocalNodeMasterListener(this);
clusterService.addLifecycleListener(new LifecycleListener() {
@Override
Expand Down Expand Up @@ -206,6 +207,10 @@ void setMaxOpenJobs(int maxOpenJobs) {
this.maxOpenJobs = maxOpenJobs;
}

void setUseAuto(boolean useAuto) {
this.useAuto = useAuto;
}

@Override
public void onMaster() {
isMaster = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.elasticsearch.cluster.routing.IndexRoutingTable;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.persistent.PersistentTaskParams;
import org.elasticsearch.persistent.PersistentTasksCustomMetadata;
import org.elasticsearch.persistent.PersistentTasksExecutor;
Expand Down Expand Up @@ -60,16 +61,16 @@ public static List<String> verifyIndicesPrimaryShardsAreActive(ClusterState clus
return unavailableIndices;
}

protected final boolean useAutoMemoryPercentage;

protected final MlMemoryTracker memoryTracker;
protected final IndexNameExpressionResolver expressionResolver;

protected volatile int maxConcurrentJobAllocations;
protected volatile int maxMachineMemoryPercent;
protected volatile int maxLazyMLNodes;
protected volatile boolean useAutoMemoryPercentage;
protected volatile long maxNodeMemory;
protected volatile int maxOpenJobs;
protected final long maxNodeMemory;

protected AbstractJobPersistentTasksExecutor(String taskName,
String executor,
Expand All @@ -92,6 +93,8 @@ protected AbstractJobPersistentTasksExecutor(String taskName,
.addSettingsUpdateConsumer(MachineLearning.MAX_MACHINE_MEMORY_PERCENT, this::setMaxMachineMemoryPercent);
clusterService.getClusterSettings().addSettingsUpdateConsumer(MachineLearning.MAX_LAZY_ML_NODES, this::setMaxLazyMLNodes);
clusterService.getClusterSettings().addSettingsUpdateConsumer(MAX_OPEN_JOBS_PER_NODE, this::setMaxOpenJobs);
clusterService.getClusterSettings().addSettingsUpdateConsumer(USE_AUTO_MACHINE_MEMORY_PERCENT, this::setUseAutoMemoryPercentage);
clusterService.getClusterSettings().addSettingsUpdateConsumer(MAX_ML_NODE_SIZE, this::setMaxNodeSize);
}

protected abstract String[] indicesOfInterest(Params params);
Expand Down Expand Up @@ -136,6 +139,14 @@ void setMaxOpenJobs(int maxOpenJobs) {
this.maxOpenJobs = maxOpenJobs;
}

void setUseAutoMemoryPercentage(boolean useAutoMemoryPercentage) {
this.useAutoMemoryPercentage = useAutoMemoryPercentage;
}

void setMaxNodeSize(ByteSizeValue maxNodeSize) {
this.maxNodeMemory = maxNodeSize.getBytes();
}

public Optional<PersistentTasksCustomMetadata.Assignment> checkRequiredIndices(String jobId,
ClusterState clusterState,
String... indicesOfInterest) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ private static TaskExecutor createTaskExecutor() {
MachineLearning.CONCURRENT_JOB_ALLOCATIONS,
MachineLearning.MAX_MACHINE_MEMORY_PERCENT,
MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT,
MachineLearning.MAX_ML_NODE_SIZE,
MachineLearning.MAX_LAZY_ML_NODES,
MachineLearning.MAX_OPEN_JOBS_PER_NODE));
when(clusterService.getClusterSettings()).thenReturn(clusterSettings);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,9 @@ public void setup() {
timeSupplier = System::currentTimeMillis;
ClusterSettings cSettings = new ClusterSettings(
Settings.EMPTY,
Set.of(MachineLearning.MAX_MACHINE_MEMORY_PERCENT, MachineLearning.MAX_OPEN_JOBS_PER_NODE));
Set.of(MachineLearning.MAX_MACHINE_MEMORY_PERCENT,
MachineLearning.MAX_OPEN_JOBS_PER_NODE,
MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT));
when(clusterService.getClusterSettings()).thenReturn(cSettings);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,13 @@ public void testValidate_givenValidJob() {

public void testGetAssignment_GivenJobThatRequiresMigration() {
ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY,
Sets.newHashSet(MachineLearning.CONCURRENT_JOB_ALLOCATIONS, MachineLearning.MAX_MACHINE_MEMORY_PERCENT,
MachineLearning.MAX_LAZY_ML_NODES, MachineLearning.MAX_OPEN_JOBS_PER_NODE, MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT)
Sets.newHashSet(MachineLearning.CONCURRENT_JOB_ALLOCATIONS,
MachineLearning.MAX_MACHINE_MEMORY_PERCENT,
MachineLearning.MAX_LAZY_ML_NODES,
MachineLearning.MAX_ML_NODE_SIZE,
MachineLearning.MAX_OPEN_JOBS_PER_NODE,
MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT
)
);
when(clusterService.getClusterSettings()).thenReturn(clusterSettings);

Expand All @@ -125,8 +130,13 @@ public void testGetAssignment_GivenJobThatRequiresMigration() {
public void testGetAssignment_GivenUnavailableIndicesWithLazyNode() {
Settings settings = Settings.builder().put(MachineLearning.MAX_LAZY_ML_NODES.getKey(), 1).build();
ClusterSettings clusterSettings = new ClusterSettings(settings,
Sets.newHashSet(MachineLearning.CONCURRENT_JOB_ALLOCATIONS, MachineLearning.MAX_MACHINE_MEMORY_PERCENT,
MachineLearning.MAX_LAZY_ML_NODES, MachineLearning.MAX_OPEN_JOBS_PER_NODE, MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT)
Sets.newHashSet(MachineLearning.CONCURRENT_JOB_ALLOCATIONS,
MachineLearning.MAX_MACHINE_MEMORY_PERCENT,
MachineLearning.MAX_LAZY_ML_NODES,
MachineLearning.MAX_ML_NODE_SIZE,
MachineLearning.MAX_OPEN_JOBS_PER_NODE,
MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT
)
);
when(clusterService.getClusterSettings()).thenReturn(clusterSettings);

Expand All @@ -150,8 +160,13 @@ public void testGetAssignment_GivenUnavailableIndicesWithLazyNode() {
public void testGetAssignment_GivenLazyJobAndNoGlobalLazyNodes() {
Settings settings = Settings.builder().put(MachineLearning.MAX_LAZY_ML_NODES.getKey(), 0).build();
ClusterSettings clusterSettings = new ClusterSettings(settings,
Sets.newHashSet(MachineLearning.CONCURRENT_JOB_ALLOCATIONS, MachineLearning.MAX_MACHINE_MEMORY_PERCENT,
MachineLearning.MAX_LAZY_ML_NODES, MachineLearning.MAX_OPEN_JOBS_PER_NODE, MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT)
Sets.newHashSet(MachineLearning.CONCURRENT_JOB_ALLOCATIONS,
MachineLearning.MAX_MACHINE_MEMORY_PERCENT,
MachineLearning.MAX_LAZY_ML_NODES,
MachineLearning.MAX_ML_NODE_SIZE,
MachineLearning.MAX_OPEN_JOBS_PER_NODE,
MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT
)
);
when(clusterService.getClusterSettings()).thenReturn(clusterSettings);

Expand Down