diff --git a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java index 4d6b1460a..13dd02511 100644 --- a/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java +++ b/src/main/java/com/autotune/analyzer/utils/AnalyzerConstants.java @@ -196,6 +196,58 @@ public enum RegisterRecommendationModelStatus { INVALID } + public enum DeviceType { + CPU, + MEMORY, + NETWORK, + ACCELERATOR + } + + public enum DeviceParameters { + MODEL_NAME, + UUID, + HOSTNAME, + NAME, + MANUFACTURER, + DEVICE_NAME + } + + public static final class AcceleratorConstants { + private AcceleratorConstants() { + + } + + public static final class SupportedAccelerators { + private SupportedAccelerators() { + + } + public static final String A100_80_GB = "A100-80GB"; + public static final String A100_40_GB = "A100-40GB"; + public static final String H100 = "H100"; + } + + public static final class AcceleratorProfiles { + private AcceleratorProfiles () { + + } + + // A100 40GB Profiles + public static final String PROFILE_1G_5GB = "1g.5gb"; + public static final String PROFILE_1G_10GB = "1g.10gb"; + public static final String PROFILE_2G_10GB = "2g.10gb"; + public static final String PROFILE_3G_20GB = "3g.20gb"; + public static final String PROFILE_4G_20GB = "4g.20gb"; + public static final String PROFILE_7G_40GB = "7g.40gb"; + + // A100 80GB & H100 80GB Profiles + public static final String PROFILE_1G_20GB = "1g.20gb"; + public static final String PROFILE_2G_20GB = "2g.20gb"; + public static final String PROFILE_3G_40GB = "3g.40gb"; + public static final String PROFILE_4G_40GB = "4g.40gb"; + public static final String PROFILE_7G_80GB = "7g.80gb"; + } + } + public static final class ExperimentTypes { public static final String NAMESPACE_EXPERIMENT = "namespace"; public static final String CONTAINER_EXPERIMENT = "container"; diff --git 
a/src/main/java/com/autotune/common/data/system/info/device/ContainerDeviceList.java b/src/main/java/com/autotune/common/data/system/info/device/ContainerDeviceList.java new file mode 100644 index 000000000..00de9e322 --- /dev/null +++ b/src/main/java/com/autotune/common/data/system/info/device/ContainerDeviceList.java @@ -0,0 +1,144 @@ +package com.autotune.common.data.system.info.device; + +import com.autotune.analyzer.utils.AnalyzerConstants; +import com.autotune.common.data.system.info.device.accelerator.AcceleratorDeviceData; + +import java.util.ArrayList; +import java.util.HashMap; + +/** + * This class stores the device entries linked to the container + */ +public class ContainerDeviceList implements DeviceHandler, DeviceComponentDetector { + private final HashMap> deviceMap; + private boolean isAcceleratorDeviceDetected; + private boolean isCPUDeviceDetected; + private boolean isMemoryDeviceDetected; + private boolean isNetworkDeviceDetected; + + public ContainerDeviceList(){ + this.deviceMap = new HashMap>(); + this.isAcceleratorDeviceDetected = false; + // Currently setting up CPU, Memory and Network as true by default + this.isCPUDeviceDetected = true; + this.isMemoryDeviceDetected = true; + this.isNetworkDeviceDetected = true; + } + + @Override + public void addDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo) { + if (null == deviceType || null == deviceInfo) { + // TODO: Handle appropriate returns in future + return; + } + + if (deviceType == AnalyzerConstants.DeviceType.ACCELERATOR) + this.isAcceleratorDeviceDetected = true; + + // TODO: Handle multiple same entries + // Currently only first MIG is getting added so no check for existing duplicates is done + if (null == deviceMap.get(deviceType)) { + ArrayList deviceDetailsList = new ArrayList(); + deviceDetailsList.add(deviceInfo); + this.deviceMap.put(deviceType, deviceDetailsList); + } else { + this.deviceMap.get(deviceType).add(deviceInfo); + } + } + + @Override + public 
void removeDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo) { + if (null == deviceType || null == deviceInfo) { + // TODO: Handle appropriate returns in future + return; + } + // TODO: Need to be implemented if we need a dynamic experiment device updates + if (deviceType == AnalyzerConstants.DeviceType.ACCELERATOR) { + if (null == deviceMap.get(deviceType) || this.deviceMap.get(deviceType).isEmpty()) { + this.isAcceleratorDeviceDetected = false; + } + } + } + + @Override + public void updateDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo) { + // TODO: Need to be implemented if we need a dynamic experiment device updates + } + + /** + * Returns the Device which matches the identifier based on the device parameter passed + * @param deviceType - Type of the device Eg: CPU, Memory, Network or Accelerator + * @param matchIdentifier - String which needs to the matched + * @param deviceParameters - Parameter to search in device details list + * @return the appropriate DeviceDetails object + * + * USE CASE: To search the device based on a particular parameter, Let's say you have multiple accelerators + * to the container, you can pass the Model name as parameter and name of model to get the particular + * DeviceDetail object. 
+ */ + @Override + public DeviceDetails getDeviceByParameter(AnalyzerConstants.DeviceType deviceType, String matchIdentifier, AnalyzerConstants.DeviceParameters deviceParameters) { + if (null == deviceType) + return null; + if (null == matchIdentifier) + return null; + if (null == deviceParameters) + return null; + if (matchIdentifier.isEmpty()) + return null; + if (!deviceMap.containsKey(deviceType)) + return null; + if (null == deviceMap.get(deviceType)) + return null; + if (deviceMap.get(deviceType).isEmpty()) + return null; + + // Todo: Need to add extractors for each device type currently implementing for GPU + if (deviceType == AnalyzerConstants.DeviceType.ACCELERATOR) { + for (DeviceDetails deviceDetails: deviceMap.get(deviceType)) { + AcceleratorDeviceData deviceData = (AcceleratorDeviceData) deviceDetails; + if (deviceParameters == AnalyzerConstants.DeviceParameters.MODEL_NAME) { + if (deviceData.getModelName().equalsIgnoreCase(matchIdentifier)) { + return deviceData; + } + } + } + } + + return null; + } + + @Override + public ArrayList getDevices(AnalyzerConstants.DeviceType deviceType) { + if (null == deviceType) + return null; + if (!deviceMap.containsKey(deviceType)) + return null; + if (null == deviceMap.get(deviceType)) + return null; + if (deviceMap.get(deviceType).isEmpty()) + return null; + + return deviceMap.get(deviceType); + } + + @Override + public boolean isAcceleratorDeviceDetected() { + return this.isAcceleratorDeviceDetected; + } + + @Override + public boolean isCPUDeviceDetected() { + return this.isCPUDeviceDetected; + } + + @Override + public boolean isMemoryDeviceDetected() { + return this.isMemoryDeviceDetected; + } + + @Override + public boolean isNetworkDeviceDetected() { + return this.isNetworkDeviceDetected; + } +} diff --git a/src/main/java/com/autotune/common/data/system/info/device/DeviceComponentDetector.java b/src/main/java/com/autotune/common/data/system/info/device/DeviceComponentDetector.java new file mode 100644 index 
// File: src/main/java/com/autotune/common/data/system/info/device/DeviceComponentDetector.java (new file)
// Package: com.autotune.common.data.system.info.device

/**
 * Reports, per device category, whether a device of that category has been detected.
 */
public interface DeviceComponentDetector {
    /** @return {@code true} when an accelerator device has been detected */
    boolean isAcceleratorDeviceDetected();

    /** @return {@code true} when a CPU device has been detected */
    boolean isCPUDeviceDetected();

    /** @return {@code true} when a memory device has been detected */
    boolean isMemoryDeviceDetected();

    /** @return {@code true} when a network device has been detected */
    boolean isNetworkDeviceDetected();
}

// File: src/main/java/com/autotune/common/data/system/info/device/DeviceDetails.java (new file)
// Package: com.autotune.common.data.system.info.device

import com.autotune.analyzer.utils.AnalyzerConstants;

/**
 * Common contract of every device description.
 */
public interface DeviceDetails {
    /** @return the category this device belongs to */
    AnalyzerConstants.DeviceType getType();
}

// File: src/main/java/com/autotune/common/data/system/info/device/DeviceHandler.java (new file)
// Package: com.autotune.common.data.system.info.device

import com.autotune.analyzer.utils.AnalyzerConstants;

import java.util.ArrayList;

/**
 * Operations for maintaining and querying a collection of devices grouped by device type.
 */
public interface DeviceHandler {
    /**
     * Registers a device under the given type.
     *
     * @param deviceType - Type of the device
     * @param deviceInfo - Details of the device
     */
    void addDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo);

    /**
     * Unregisters a device of the given type.
     *
     * @param deviceType - Type of the device
     * @param deviceInfo - Details of the device
     */
    void removeDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo);

    /**
     * Updates a device of the given type.
     *
     * @param deviceType - Type of the device
     * @param deviceInfo - Details of the device
     */
    void updateDevice(AnalyzerConstants.DeviceType deviceType, DeviceDetails deviceInfo);

    /**
     * Looks up a single device of the given type by one of its parameters.
     *
     * @param deviceType - Type of the device
     * @param matchIdentifier - Value the chosen parameter has to match
     * @param deviceParameters - Parameter of the device to compare against
     * @return the matching device
     */
    DeviceDetails getDeviceByParameter(AnalyzerConstants.DeviceType deviceType,
                                       String matchIdentifier,
                                       AnalyzerConstants.DeviceParameters deviceParameters);

    /**
     * Lists the devices registered under the given type.
     *
     * @param deviceType - Type of the device
     * @return the devices of that type
     */
    ArrayList<DeviceDetails> getDevices(AnalyzerConstants.DeviceType deviceType);
}
a/src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceData.java b/src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceData.java new file mode 100644 index 000000000..a3a09fead --- /dev/null +++ b/src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceData.java @@ -0,0 +1,59 @@ +package com.autotune.common.data.system.info.device.accelerator; + +import com.autotune.analyzer.utils.AnalyzerConstants; + +public class AcceleratorDeviceData implements AcceleratorDeviceDetails { + private final String manufacturer; + private final String modelName; + private final String hostName; + private final String UUID; + private final String deviceName; + private boolean isMIG; + + public AcceleratorDeviceData (String modelName, String hostName, String UUID, String deviceName, boolean isMIG) { + this.manufacturer = "NVIDIA"; + this.modelName = modelName; + this.hostName = hostName; + this.UUID = UUID; + this.deviceName = deviceName; + this.isMIG = isMIG; + } + + @Override + public String getManufacturer() { + return this.manufacturer; + } + + @Override + public String getModelName() { + return modelName; + } + + @Override + public String getHostName() { + return hostName; + } + + @Override + public String getUUID() { + return UUID; + } + + @Override + public String getDeviceName() { + return deviceName; + } + + public boolean isMIG() { + return isMIG; + } + + public void setMIG(boolean isMIG) { + this.isMIG = isMIG; + } + + @Override + public AnalyzerConstants.DeviceType getType() { + return AnalyzerConstants.DeviceType.ACCELERATOR; + } +} diff --git a/src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceDetails.java b/src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceDetails.java new file mode 100644 index 000000000..31b90ff66 --- /dev/null +++ 
// File: src/main/java/com/autotune/common/data/system/info/device/accelerator/AcceleratorDeviceDetails.java (new file)
// Package: com.autotune.common.data.system.info.device.accelerator

import com.autotune.common.data.system.info.device.DeviceDetails;

/**
 * Device details that are specific to accelerators.
 */
public interface AcceleratorDeviceDetails extends DeviceDetails {
    /** @return the manufacturer of the accelerator */
    String getManufacturer();

    /** @return the model name of the accelerator */
    String getModelName();

    /** @return the host name associated with the accelerator */
    String getHostName();

    /** @return the UUID of the accelerator */
    String getUUID();

    /** @return the device name of the accelerator */
    String getDeviceName();
}

// File: src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorMetaDataService.java (new file)
// Package: com.autotune.common.data.system.info.device.accelerator.metadata

import com.autotune.analyzer.utils.AnalyzerConstants;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
 * A service which is created to provide the respective Accelerator Profile
 * based on SM and Memory requirements
 *
 * This service loads the profiles of supported Accelerators once, when the class is initialized.
 * Currently it supports:
 *      NVIDIA A100 40GB
 *      NVIDIA A100 80GB
 *      NVIDIA H100 80GB
 *
 * The profile registry is immutable after initialization, so the shared instance
 * can be used from multiple threads.
 */
public class AcceleratorMetaDataService {
    // Keyed case-insensitively so that lookups agree with the case-insensitive model name matching.
    // Declared before the instance so it is ready by the time the instance is created.
    private static final Map<String, List<AcceleratorProfile>> acceleratorProfilesMap = loadAcceleratorProfiles();
    // Created during class initialization, which the JVM performs once and publishes safely
    private static final AcceleratorMetaDataService acceleratorMetaDataService = new AcceleratorMetaDataService();

    /**
     * Not instantiable from outside, use {@link #getInstance()}.
     */
    private AcceleratorMetaDataService() {
    }

    /**
     * Builds the registry which maps a supported model name to its profiles.
     *
     * @return unmodifiable map from model name (case-insensitive) to an immutable, ordered profile list
     */
    private static Map<String, List<AcceleratorProfile>> loadAcceleratorProfiles() {
        List<AcceleratorProfile> commonProfiles = new ArrayList<>();
        // IMPORTANT: Add it in the ascending order according to GPU Core and Memory Units as we will break the loop upon getting the right one
        commonProfiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_1G_10GB,
                1.0 / 8, 1.0 / 7, 7));
        commonProfiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_1G_20GB,
                1.0 / 4, 1.0 / 7, 4));
        commonProfiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_2G_20GB,
                2.0 / 8, 2.0 / 7, 3));
        commonProfiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_3G_40GB,
                4.0 / 8, 3.0 / 7, 2));
        commonProfiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_4G_40GB,
                4.0 / 8, 4.0 / 7, 1));
        commonProfiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_7G_80GB,
                1.0, 1.0, 1));

        List<AcceleratorProfile> a100_40_gb_profiles = new ArrayList<>();
        // IMPORTANT: Add it in the ascending order according to GPU Core and Memory Units as we will break the loop upon getting the right one
        a100_40_gb_profiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_1G_5GB,
                1.0 / 8, 1.0 / 7, 7));
        a100_40_gb_profiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_1G_10GB,
                1.0 / 4, 1.0 / 7, 4));
        a100_40_gb_profiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_2G_10GB,
                2.0 / 8, 2.0 / 7, 3));
        a100_40_gb_profiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_3G_20GB,
                4.0 / 8, 3.0 / 7, 2));
        a100_40_gb_profiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_4G_20GB,
                4.0 / 8, 4.0 / 7, 1));
        a100_40_gb_profiles.add(new AcceleratorProfile(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_7G_40GB,
                1.0, 1.0, 1));

        Map<String, List<AcceleratorProfile>> profilesByModel = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
        profilesByModel.put(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.A100_80_GB, List.copyOf(commonProfiles));
        profilesByModel.put(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.H100, List.copyOf(commonProfiles));
        profilesByModel.put(AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.A100_40_GB, List.copyOf(a100_40_gb_profiles));
        return Collections.unmodifiableMap(profilesByModel);
    }

    /**
     * @return the single shared instance of the service
     */
    public static AcceleratorMetaDataService getInstance() {
        return acceleratorMetaDataService;
    }

    /**
     * Picks the first (smallest) profile of the given model which satisfies both requirements.
     *
     * @param modelName - Model name of the accelerator; surrounding whitespace and letter case are ignored
     * @param requiredSmFraction - Minimum fraction of the accelerator cores needed
     * @param requiredMemoryFraction - Minimum fraction of the accelerator memory needed
     * @return the matching profile, or {@code null} when an argument is {@code null}, a fraction is
     *         negative, the model is not supported, or no profile is large enough
     */
    public AcceleratorProfile getAcceleratorProfile(String modelName, Double requiredSmFraction, Double requiredMemoryFraction) {
        if (null == modelName || null == requiredSmFraction || null == requiredMemoryFraction) {
            return null;
        }
        if (requiredMemoryFraction < 0.0 || requiredSmFraction < 0.0) {
            return null;
        }
        // The registry ignores case, so "a100-80gb" resolves to the same entry as "A100-80GB"
        List<AcceleratorProfile> gpuProfiles = acceleratorProfilesMap.get(modelName.strip());
        if (null == gpuProfiles) {
            // Unsupported model
            return null;
        }
        for (AcceleratorProfile profile : gpuProfiles) {
            if (profile.getMemoryFraction() >= requiredMemoryFraction && profile.getSmFraction() >= requiredSmFraction) {
                // Returning the profile as the list is in ascending order
                return profile;
            }
        }
        return null;
    }
}
// File: src/main/java/com/autotune/common/data/system/info/device/accelerator/metadata/AcceleratorProfile.java (new file)
// Package: com.autotune.common.data.system.info.device.accelerator.metadata

/**
 * Class which is used to store the details of an accelerator profile.
 * Instances are immutable.
 */
public class AcceleratorProfile {
    private final String profileName;
    private final double memoryFraction;
    private final double smFraction;
    private final int instancesAvailable;

    /**
     * Constructor to create the Accelerator Profile
     * @param profileName - Name of the profile
     * @param memoryFraction - Fraction of memory out of the whole accelerator memory
     * @param smFraction - Fraction of Cores or Streaming Processors out of the whole accelerator cores
     * @param instancesAvailable - Number of instances of a profile available on an Accelerator
     * @throws NullPointerException if {@code profileName} is {@code null}
     * @throws IllegalArgumentException if a fraction is NaN or outside [0.0, 1.0],
     *         or if {@code instancesAvailable} is negative
     */
    public AcceleratorProfile(String profileName, double memoryFraction, double smFraction, int instancesAvailable) {
        if (null == profileName) {
            throw new NullPointerException("profileName must not be null");
        }
        if (instancesAvailable < 0) {
            throw new IllegalArgumentException("instancesAvailable must not be negative, got " + instancesAvailable);
        }
        this.profileName = profileName;
        this.memoryFraction = checkedFraction("memoryFraction", memoryFraction);
        this.smFraction = checkedFraction("smFraction", smFraction);
        this.instancesAvailable = instancesAvailable;
    }

    /**
     * Verifies that a value is a usable fraction of a whole.
     *
     * @param label - Name of the value, used in the error message
     * @param value - Value to verify
     * @return the value itself when it lies within [0.0, 1.0]
     */
    private static double checkedFraction(String label, double value) {
        // Written as a negated range test so that NaN is rejected as well
        if (!(value >= 0.0 && value <= 1.0)) {
            throw new IllegalArgumentException(label + " must be within [0.0, 1.0], got " + value);
        }
        return value;
    }

    /** @return the name of the profile */
    public String getProfileName() {
        return this.profileName;
    }

    /** @return the fraction of the whole accelerator memory covered by this profile */
    public double getMemoryFraction() {
        return memoryFraction;
    }

    /** @return the fraction of the whole accelerator cores covered by this profile */
    public double getSmFraction() {
        return smFraction;
    }

    /** @return the number of instances of this profile available on an accelerator */
    public int getInstancesAvailable() {
        return instancesAvailable;
    }

    @Override
    public String toString() {
        return "AcceleratorProfile{" +
                "profileName='" + profileName + '\'' +
                ", memoryFraction=" + memoryFraction +
                ", smFraction=" + smFraction +
                ", instancesAvailable=" + instancesAvailable +
                '}';
    }
}