Skip to content

Commit

Permalink
move container data tagging to utils to resuse in generate recommenda…
Browse files Browse the repository at this point in the history
…tions

Signed-off-by: bharathappali <abharath@redhat.com>
  • Loading branch information
bharathappali committed Oct 1, 2024
1 parent 0b60648 commit 4aaed25
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 124 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

package com.autotune.analyzer.services;

import com.autotune.analyzer.exceptions.FetchMetricsError;
import com.autotune.analyzer.exceptions.InvalidExperimentType;
import com.autotune.analyzer.exceptions.KruizeResponse;
import com.autotune.analyzer.experiment.ExperimentInitiator;
Expand Down Expand Up @@ -158,6 +159,8 @@ protected void doPost(HttpServletRequest request, HttpServletResponse response)
sendErrorResponse(inputData, response, e, HttpServletResponse.SC_INTERNAL_SERVER_ERROR, "Internal Server Error: " + e.getMessage());
} catch (InvalidExperimentType e) {
sendErrorResponse(inputData, response, null, HttpServletResponse.SC_BAD_REQUEST, e.getMessage());
} catch (FetchMetricsError e) {
throw new RuntimeException(e);
} finally {
if (null != timerCreateExp) {
MetricsConfig.timerCreateExp = MetricsConfig.timerBCreateExp.tag("status", statusValue).register(MetricsConfig.meterRegistry());
Expand Down
133 changes: 12 additions & 121 deletions src/main/java/com/autotune/analyzer/utils/ServiceHelpers.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import com.autotune.analyzer.application.ApplicationSearchSpace;
import com.autotune.analyzer.application.ApplicationServiceStack;
import com.autotune.analyzer.application.Tunable;
import com.autotune.analyzer.exceptions.FetchMetricsError;
import com.autotune.analyzer.kruizeLayer.KruizeLayer;
import com.autotune.analyzer.kruizeObject.KruizeObject;
import com.autotune.analyzer.performanceProfiles.MetricProfileCollection;
Expand All @@ -32,6 +33,7 @@
import com.autotune.common.datasource.DataSourceInfo;
import com.autotune.common.exceptions.DataSourceNotExist;
import com.autotune.common.k8sObjects.K8sObject;
import com.autotune.common.utils.CommonUtils;
import com.autotune.database.service.ExperimentDBService;
import com.autotune.operator.KruizeDeploymentInfo;
import com.autotune.utils.GenericRestApiClient;
Expand Down Expand Up @@ -273,10 +275,8 @@ public static boolean checkRecommendationTimestampExists(KruizeObject kruizeObje
return timestampExists;
}

public static void checkAndTagGPUWorkloads(KruizeObject kruizeObject) throws Exception {
public static void checkAndTagGPUWorkloads(KruizeObject kruizeObject) throws Exception, FetchMetricsError {
SimpleDateFormat sdf = new SimpleDateFormat(KruizeConstants.DateFormats.STANDARD_JSON_DATE_FORMAT, Locale.ROOT);
long interval_end_time_epoc = 0;
long interval_start_time_epoc = 0;
String metricProfileName = kruizeObject.getPerformanceProfile();
PerformanceProfile metricProfile = MetricProfileCollection.getInstance().getMetricProfileCollection().get(metricProfileName);
if (null == metricProfile) {
Expand Down Expand Up @@ -323,127 +323,18 @@ public static void checkAndTagGPUWorkloads(KruizeObject kruizeObject) throws Exc
// Iterate over containers
for (Map.Entry<String, ContainerData> entry : containerDataMap.entrySet()) {
ContainerData containerData = entry.getValue();
String containerName = containerData.getContainer_name();

String queryToEncode = null;

LOGGER.info("maxDateQuery: {}", maxDateQuery);
queryToEncode = maxDateQuery
.replace(AnalyzerConstants.NAMESPACE_VARIABLE, namespace)
.replace(AnalyzerConstants.CONTAINER_VARIABLE, containerName)
.replace(AnalyzerConstants.WORKLOAD_VARIABLE, workload)
.replace(AnalyzerConstants.WORKLOAD_TYPE_VARIABLE, workload_type);

String dateMetricsUrl = String.format(KruizeConstants.DataSourceConstants.DATE_ENDPOINT_WITH_QUERY,
dataSourceInfo.getUrl(),
URLEncoder.encode(queryToEncode, CHARACTER_ENCODING)
);

LOGGER.info(dateMetricsUrl);
JSONObject genericJsonObject = new GenericRestApiClient(dateMetricsUrl).fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
JsonObject jsonObject = new Gson().fromJson(genericJsonObject.toString(), JsonObject.class);
JsonArray resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT);

if (null == resultArray || resultArray.isEmpty()) {
// Need to alert that container max duration is not detected
// Ignoring it here, as we take care of it at generate recommendations
return;
}

resultArray = resultArray.get(0)
.getAsJsonObject().getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.VALUE);
long epochTime = resultArray.get(0).getAsLong();
String timestamp = sdf.format(new Date(epochTime * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC));
Date date = sdf.parse(timestamp);
Timestamp dateTS = new Timestamp(date.getTime());
interval_end_time_epoc = dateTS.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
- ((long) dateTS.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
int maxDay = Terms.getMaxDays(kruizeObject.getTerms());
LOGGER.info(KruizeConstants.APIMessages.MAX_DAY, maxDay);
Timestamp startDateTS = Timestamp.valueOf(Objects.requireNonNull(dateTS).toLocalDateTime().minusDays(maxDay));
interval_start_time_epoc = startDateTS.getTime() / KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC
- ((long) startDateTS.getTimezoneOffset() * KruizeConstants.TimeConv.NO_OF_MSECS_IN_SEC);

gpuDetectionQuery = gpuDetectionQuery.replace(AnalyzerConstants.NAMESPACE_VARIABLE, namespace)
.replace(AnalyzerConstants.CONTAINER_VARIABLE, containerName)
.replace(AnalyzerConstants.MEASUREMENT_DURATION_IN_MIN_VARAIBLE, Integer.toString(measurementDurationMinutesInDouble.intValue()))
.replace(AnalyzerConstants.WORKLOAD_VARIABLE, workload)
.replace(AnalyzerConstants.WORKLOAD_TYPE_VARIABLE, workload_type);

String podMetricsUrl;
try {
podMetricsUrl = String.format(KruizeConstants.DataSourceConstants.DATASOURCE_ENDPOINT_WITH_QUERY,
dataSourceInfo.getUrl(),
URLEncoder.encode(gpuDetectionQuery, CHARACTER_ENCODING),
interval_start_time_epoc,
interval_end_time_epoc,
measurementDurationMinutesInDouble.intValue() * KruizeConstants.TimeConv.NO_OF_SECONDS_PER_MINUTE);
LOGGER.info(podMetricsUrl);
genericJsonObject = new GenericRestApiClient(podMetricsUrl).fetchMetricsJson(KruizeConstants.APIMessages.GET, "");
jsonObject = new Gson().fromJson(genericJsonObject.toString(), JsonObject.class);
resultArray = jsonObject.getAsJsonObject(KruizeConstants.JSONKeys.DATA).getAsJsonArray(KruizeConstants.DataSourceConstants.DataSourceQueryJSONKeys.RESULT);

if (null != resultArray && !resultArray.isEmpty()) {
for (JsonElement result : resultArray) {
JsonObject resultObject = result.getAsJsonObject();
JsonArray valuesArray = resultObject.getAsJsonArray(KruizeConstants.DataSourceConstants
.DataSourceQueryJSONKeys.VALUES);

for (JsonElement element : valuesArray) {
JsonArray valueArray = element.getAsJsonArray();
double value = valueArray.get(1).getAsDouble();
// TODO: Check for non-zero values to mark as GPU workload
break;
}

JsonObject metricObject = resultObject.getAsJsonObject(KruizeConstants.JSONKeys.METRIC);
String modelName = metricObject.get(KruizeConstants.JSONKeys.MODEL_NAME).getAsString();
if (null == modelName)
continue;

boolean isSupportedMig = checkIfModelIsKruizeSupportedMIG(modelName);
if (isSupportedMig) {
GPUDeviceData gpuDeviceData = new GPUDeviceData(metricObject.get(KruizeConstants.JSONKeys.MODEL_NAME).getAsString(),
metricObject.get(KruizeConstants.JSONKeys.HOSTNAME).getAsString(),
metricObject.get(KruizeConstants.JSONKeys.UUID).getAsString(),
metricObject.get(KruizeConstants.JSONKeys.DEVICE).getAsString(),
isSupportedMig);


if (null == containerData.getContainerDeviceList()) {
ContainerDeviceList containerDeviceList = new ContainerDeviceList();
containerData.setContainerDeviceList(containerDeviceList);
}
containerData.getContainerDeviceList().addDevice(AnalyzerConstants.DeviceType.GPU, gpuDeviceData);
// TODO: Currently we consider only the first mig supported GPU
return;
}
}
}
} catch (IOException | NoSuchAlgorithmException | KeyStoreException | KeyManagementException |
JsonSyntaxException e) {
throw new RuntimeException(e);
}
CommonUtils.markAcceleratorDeviceStatusToContainer(containerData,
maxDateQuery,
namespace,
workload,
workload_type,
dataSourceInfo,
kruizeObject.getTerms(),
measurementDurationMinutesInDouble,
gpuDetectionQuery);
}
}
}

public static boolean checkIfModelIsKruizeSupportedMIG(String modelName) {
if (null == modelName || modelName.isEmpty())
return false;

modelName = modelName.toUpperCase();

boolean A100_CHECK = (modelName.contains("A100") &&
(modelName.contains("40GB") || modelName.contains("80GB")));

boolean H100_CHECK = false;

if (!A100_CHECK) {
H100_CHECK = (modelName.contains("H100") && modelName.contains("80GB"));
}

return A100_CHECK || H100_CHECK;
}
}
}
Loading

0 comments on commit 4aaed25

Please sign in to comment.