From 03a81a1fa63e1b3781c857e3a623027f46773209 Mon Sep 17 00:00:00 2001 From: Gordon Byers Date: Tue, 5 Oct 2021 16:31:38 +0100 Subject: [PATCH] Add Metric Alerts to Kubernetes cluster (#68) Metric Alerts Ascii Art More Parameter metadata Role assignment naming consistency MonitoringMetricsPublisherRole used for Fast Alerting in bicep. * Added Firewall mode param * Adding auto compiled bicep json * added pre-release flag to PSRule --- .github/workflows/ByoVnetPrivateCI.yml | 1 + bicep/aksmetricalerts.bicep | 753 +++++++++++++++++ bicep/aksnetcontrib.bicep | 2 +- bicep/calcAzFwIp.bicep | 3 + bicep/compiled/main.json | 1026 +++++++++++++++++++++++- bicep/dnsZone.bicep | 4 +- bicep/main.bicep | 210 ++++- bicep/network.bicep | 12 - referencearchs.md | 1 + 9 files changed, 1931 insertions(+), 81 deletions(-) create mode 100644 bicep/aksmetricalerts.bicep diff --git a/.github/workflows/ByoVnetPrivateCI.yml b/.github/workflows/ByoVnetPrivateCI.yml index e820509ee..078621272 100644 --- a/.github/workflows/ByoVnetPrivateCI.yml +++ b/.github/workflows/ByoVnetPrivateCI.yml @@ -80,6 +80,7 @@ jobs: inputType: repository inputPath: "${{ env.ParamFilePath }}" baseline: 'Azure.GA_2021_09' + prerelease: true - name: Verify No Active Deployments in RG id: activedeps diff --git a/bicep/aksmetricalerts.bicep b/bicep/aksmetricalerts.bicep new file mode 100644 index 000000000..e6c88d794 --- /dev/null +++ b/bicep/aksmetricalerts.bicep @@ -0,0 +1,753 @@ +@description('The name of the AKS Cluster to configure the alerts on') +param clusterName string + +@description('The name of the Log Analytics workspace to log metric data to') +param logAnalyticsWorkspaceName string + +@description('The location of the Log Analytics workspace') +param logAnalyticsWorkspaceLocation string = resourceGroup().location + +@description('Select the frequency on how often the alert rule should be run. 
Selecting frequency smaller than granularity of datapoints grouping will result in sliding window evaluation') +@allowed([ + 'PT1M' + 'PT15M' +]) +param evalFrequency string = 'PT1M' + +@description('Create the metric alerts as either enabled or disabled') +param metricAlertsEnabled bool = true + +@description('Defines the interval over which datapoints are grouped using the aggregation type function') +@allowed([ + 'PT5M' + 'PT1H' +]) +param windowSize string = 'PT5M' + +@allowed([ + 'Critical' + 'Error' + 'Warning' + 'Informational' + 'Verbose' +]) +param alertSeverity string = 'Informational' + +var alertSeverityLookup = { + 'Critical': 0 + 'Error': 1 + 'Warning': 2 + 'Informational': 3 + 'Verbose': 4 +} +var alertSeverityNumber = alertSeverityLookup[alertSeverity] + +var AksResourceId = resourceId('Microsoft.ContainerService/managedClusters', clusterName) + +resource Node_CPU_utilization_high_for_clusterName_CI_1 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: 'Node CPU utilization high for ${clusterName} CI-1' + location: 'global' + properties: { + criteria: { + allOf: [ + { + criterionType: 'StaticThresholdCriterion' + dimensions: [ + { + name: 'host' + operator: 'Include' + values: [ + '*' + ] + } + ] + metricName: 'cpuUsagePercentage' + metricNamespace: 'Insights.Container/nodes' + name: 'Metric1' + operator: 'GreaterThan' + threshold: 80 + timeAggregation: 'Average' + skipMetricValidation: true + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + description: 'Node CPU utilization across the cluster.' 
+ enabled: metricAlertsEnabled + evaluationFrequency: evalFrequency + scopes: [ + AksResourceId + ] + severity: alertSeverityNumber + targetResourceType: 'microsoft.containerservice/managedclusters' + windowSize: windowSize + } +} + +resource Node_working_set_memory_utilization_high_for_clusterName_CI_2 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: 'Node working set memory utilization high for ${clusterName} CI-2' + location: 'global' + properties: { + criteria: { + allOf: [ + { + criterionType: 'StaticThresholdCriterion' + dimensions: [ + { + name: 'host' + operator: 'Include' + values: [ + '*' + ] + } + ] + metricName: 'memoryWorkingSetPercentage' + metricNamespace: 'Insights.Container/nodes' + name: 'Metric1' + operator: 'GreaterThan' + threshold: 80 + timeAggregation: 'Average' + skipMetricValidation: true + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + description: 'Node working set memory utilization across the cluster.' + enabled: metricAlertsEnabled + evaluationFrequency: evalFrequency + scopes: [ + AksResourceId + ] + severity: alertSeverityNumber + targetResourceType: 'microsoft.containerservice/managedclusters' + windowSize: windowSize + } +} + +resource Jobs_completed_more_than_6_hours_ago_for_clusterName_CI_11 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: 'Jobs completed more than 6 hours ago for ${clusterName} CI-11' + location: 'global' + properties: { + criteria: { + allOf: [ + { + criterionType: 'StaticThresholdCriterion' + dimensions: [ + { + name: 'controllerName' + operator: 'Include' + values: [ + '*' + ] + } + { + name: 'kubernetes namespace' + operator: 'Include' + values: [ + '*' + ] + } + ] + metricName: 'completedJobsCount' + metricNamespace: 'Insights.Container/pods' + name: 'Metric1' + operator: 'GreaterThan' + threshold: 0 + timeAggregation: 'Average' + skipMetricValidation: true + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + 
description: 'This alert monitors completed jobs (more than 6 hours ago).' + enabled: metricAlertsEnabled + evaluationFrequency: evalFrequency + scopes: [ + AksResourceId + ] + severity: alertSeverityNumber + targetResourceType: 'microsoft.containerservice/managedclusters' + windowSize: windowSize + } +} + +resource Container_CPU_usage_high_for_clusterName_CI_9 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: 'Container CPU usage high for ${clusterName} CI-9' + location: 'global' + properties: { + criteria: { + allOf: [ + { + criterionType: 'StaticThresholdCriterion' + dimensions: [ + { + name: 'controllerName' + operator: 'Include' + values: [ + '*' + ] + } + { + name: 'kubernetes namespace' + operator: 'Include' + values: [ + '*' + ] + } + ] + metricName: 'cpuExceededPercentage' + metricNamespace: 'Insights.Container/containers' + name: 'Metric1' + operator: 'GreaterThan' + threshold: 90 + timeAggregation: 'Average' + skipMetricValidation: true + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + description: 'This alert monitors container CPU utilization.' 
+ enabled: metricAlertsEnabled + evaluationFrequency: evalFrequency + scopes: [ + AksResourceId + ] + severity: alertSeverityNumber + targetResourceType: 'microsoft.containerservice/managedclusters' + windowSize: windowSize + } +} + +resource Container_working_set_memory_usage_high_for_clusterName_CI_10 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: 'Container working set memory usage high for ${clusterName} CI-10' + location: 'global' + properties: { + criteria: { + allOf: [ + { + criterionType: 'StaticThresholdCriterion' + dimensions: [ + { + name: 'controllerName' + operator: 'Include' + values: [ + '*' + ] + } + { + name: 'kubernetes namespace' + operator: 'Include' + values: [ + '*' + ] + } + ] + metricName: 'memoryWorkingSetExceededPercentage' + metricNamespace: 'Insights.Container/containers' + name: 'Metric1' + operator: 'GreaterThan' + threshold: 90 + timeAggregation: 'Average' + skipMetricValidation: true + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + description: 'This alert monitors container working set memory utilization.' 
+ enabled: metricAlertsEnabled + evaluationFrequency: evalFrequency + scopes: [ + AksResourceId + ] + severity: alertSeverityNumber + targetResourceType: 'microsoft.containerservice/managedclusters' + windowSize: windowSize + } +} + +resource Pods_in_failed_state_for_clusterName_CI_4 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: 'Pods in failed state for ${clusterName} CI-4' + location: 'global' + properties: { + criteria: { + allOf: [ + { + criterionType: 'StaticThresholdCriterion' + dimensions: [ + { + name: 'phase' + operator: 'Include' + values: [ + 'Failed' + ] + } + ] + metricName: 'podCount' + metricNamespace: 'Insights.Container/pods' + name: 'Metric1' + operator: 'GreaterThan' + threshold: 0 + timeAggregation: 'Average' + skipMetricValidation: true + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + description: 'Pod status monitoring.' + enabled: metricAlertsEnabled + evaluationFrequency: evalFrequency + scopes: [ + AksResourceId + ] + severity: alertSeverityNumber + targetResourceType: 'microsoft.containerservice/managedclusters' + windowSize: windowSize + } +} + +resource Disk_usage_high_for_clusterName_CI_5 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: 'Disk usage high for ${clusterName} CI-5' + location: 'global' + properties: { + criteria: { + allOf: [ + { + criterionType: 'StaticThresholdCriterion' + dimensions: [ + { + name: 'host' + operator: 'Include' + values: [ + '*' + ] + } + { + name: 'device' + operator: 'Include' + values: [ + '*' + ] + } + ] + metricName: 'DiskUsedPercentage' + metricNamespace: 'Insights.Container/nodes' + name: 'Metric1' + operator: 'GreaterThan' + threshold: 80 + timeAggregation: 'Average' + skipMetricValidation: true + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + description: 'This alert monitors disk usage for all nodes and storage devices.' 
+ enabled: metricAlertsEnabled + evaluationFrequency: evalFrequency + scopes: [ + AksResourceId + ] + severity: alertSeverityNumber + targetResourceType: 'microsoft.containerservice/managedclusters' + windowSize: windowSize + } +} + +resource Nodes_in_not_ready_status_for_clusterName_CI_3 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: 'Nodes in not ready status for ${clusterName} CI-3' + location: 'global' + properties: { + criteria: { + allOf: [ + { + criterionType: 'StaticThresholdCriterion' + dimensions: [ + { + name: 'status' + operator: 'Include' + values: [ + 'NotReady' + ] + } + ] + metricName: 'nodesCount' + metricNamespace: 'Insights.Container/nodes' + name: 'Metric1' + operator: 'GreaterThan' + threshold: 0 + timeAggregation: 'Average' + skipMetricValidation: true + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + description: 'Node status monitoring.' + enabled: metricAlertsEnabled + evaluationFrequency: evalFrequency + scopes: [ + AksResourceId + ] + severity: alertSeverityNumber + targetResourceType: 'microsoft.containerservice/managedclusters' + windowSize: windowSize + } +} + +resource Containers_getting_OOM_killed_for_clusterName_CI_6 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: 'Containers getting OOM killed for ${clusterName} CI-6' + location: 'global' + properties: { + criteria: { + allOf: [ + { + criterionType: 'StaticThresholdCriterion' + dimensions: [ + { + name: 'kubernetes namespace' + operator: 'Include' + values: [ + '*' + ] + } + { + name: 'controllerName' + operator: 'Include' + values: [ + '*' + ] + } + ] + metricName: 'oomKilledContainerCount' + metricNamespace: 'Insights.Container/pods' + name: 'Metric1' + operator: 'GreaterThan' + threshold: 0 + timeAggregation: 'Average' + skipMetricValidation: true + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + description: 'This alert monitors number of containers killed due to out of 
memory (OOM) error.' + enabled: metricAlertsEnabled + evaluationFrequency: evalFrequency + scopes: [ + AksResourceId + ] + severity: alertSeverityNumber + targetResourceType: 'microsoft.containerservice/managedclusters' + windowSize: windowSize + } +} + +resource Persistent_volume_usage_high_for_clusterName_CI_18 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: 'Persistent volume usage high for ${clusterName} CI-18' + location: 'global' + properties: { + criteria: { + allOf: [ + { + criterionType: 'StaticThresholdCriterion' + dimensions: [ + { + name: 'podName' + operator: 'Include' + values: [ + '*' + ] + } + { + name: 'kubernetesNamespace' + operator: 'Include' + values: [ + '*' + ] + } + ] + metricName: 'pvUsageExceededPercentage' + metricNamespace: 'Insights.Container/persistentvolumes' + name: 'Metric1' + operator: 'GreaterThan' + threshold: 80 + timeAggregation: 'Average' + skipMetricValidation: true + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + description: 'This alert monitors persistent volume utilization.' 
+ enabled: false + evaluationFrequency: evalFrequency + scopes: [ + AksResourceId + ] + severity: alertSeverityNumber + targetResourceType: 'microsoft.containerservice/managedclusters' + windowSize: windowSize + } +} + +resource Pods_not_in_ready_state_for_clusterName_CI_8 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: 'Pods not in ready state for ${clusterName} CI-8' + location: 'global' + properties: { + criteria: { + allOf: [ + { + criterionType: 'StaticThresholdCriterion' + dimensions: [ + { + name: 'controllerName' + operator: 'Include' + values: [ + '*' + ] + } + { + name: 'kubernetes namespace' + operator: 'Include' + values: [ + '*' + ] + } + ] + metricName: 'PodReadyPercentage' + metricNamespace: 'Insights.Container/pods' + name: 'Metric1' + operator: 'LessThan' + threshold: 80 + timeAggregation: 'Average' + skipMetricValidation: true + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + description: 'This alert monitors for excessive pods not in the ready state.' 
+ enabled: metricAlertsEnabled + evaluationFrequency: evalFrequency + scopes: [ + AksResourceId + ] + severity: alertSeverityNumber + targetResourceType: 'microsoft.containerservice/managedclusters' + windowSize: windowSize + } +} + +resource Restarting_container_count_for_clusterName_CI_7 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: 'Restarting container count for ${clusterName} CI-7' + location: 'global' + properties: { + criteria: { + allOf: [ + { + criterionType: 'StaticThresholdCriterion' + dimensions: [ + { + name: 'kubernetes namespace' + operator: 'Include' + values: [ + '*' + ] + } + { + name: 'controllerName' + operator: 'Include' + values: [ + '*' + ] + } + ] + metricName: 'restartingContainerCount' + metricNamespace: 'Insights.Container/pods' + name: 'Metric1' + operator: 'GreaterThan' + threshold: 0 + timeAggregation: 'Average' + skipMetricValidation: true + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + description: 'This alert monitors number of containers restarting across the cluster.' + enabled: metricAlertsEnabled + evaluationFrequency: evalFrequency + scopes: [ + AksResourceId + ] + severity: alertSeverityNumber + targetResourceType: 'Microsoft.ContainerService/managedClusters' + windowSize: windowSize + } +} + +resource Container_CPU_usage_violates_the_configured_threshold_for_clustername_CI_19 'microsoft.insights/metricAlerts@2018-03-01' = { + name: 'Container CPU usage violates the configured threshold for ${clusterName} CI-19' + location: 'global' + properties: { + description: 'This alert monitors container CPU usage. It uses the threshold defined in the config map.' 
+ severity: alertSeverityNumber + enabled: metricAlertsEnabled + scopes: [ + AksResourceId + ] + evaluationFrequency: evalFrequency + windowSize: windowSize + criteria: { + allOf: [ + { + threshold: 0 + name: 'Metric1' + metricNamespace: 'Insights.Container/containers' + metricName: 'cpuThresholdViolated' + dimensions: [ + { + name: 'controllerName' + operator: 'Include' + values: [ + '*' + ] + } + { + name: 'kubernetes namespace' + operator: 'Include' + values: [ + '*' + ] + } + ] + operator: 'GreaterThan' + timeAggregation: 'Average' + skipMetricValidation: true + criterionType: 'StaticThresholdCriterion' + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + } +} + +resource Container_working_set_memory_usage_violates_the_configured_threshold_for_clustername_CI_20 'microsoft.insights/metricAlerts@2018-03-01' = { + name: 'Container working set memory usage violates the configured threshold for ${clusterName} CI-20' + location: 'global' + properties: { + description: 'This alert monitors container working set memory usage. It uses the threshold defined in the config map.' 
+ severity: alertSeverityNumber + enabled: metricAlertsEnabled + scopes: [ + AksResourceId + ] + evaluationFrequency: evalFrequency + windowSize: windowSize + criteria: { + allOf: [ + { + threshold: 0 + name: 'Metric1' + metricNamespace: 'Insights.Container/containers' + metricName: 'memoryWorkingSetThresholdViolated' + dimensions: [ + { + name: 'controllerName' + operator: 'Include' + values: [ + '*' + ] + } + { + name: 'kubernetes namespace' + operator: 'Include' + values: [ + '*' + ] + } + ] + operator: 'GreaterThan' + timeAggregation: 'Average' + skipMetricValidation: true + criterionType: 'StaticThresholdCriterion' + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + } +} + + +resource PV_usage_violates_the_configured_threshold_for_clustername_CI_21 'microsoft.insights/metricAlerts@2018-03-01' = { + name: 'PV usage violates the configured threshold for ${clusterName} CI-21' + location: 'global' + properties: { + description: 'This alert monitors PV usage. It uses the threshold defined in the config map.' 
+ severity: alertSeverityNumber + enabled: metricAlertsEnabled + scopes: [ + AksResourceId + ] + evaluationFrequency: evalFrequency + windowSize: windowSize + criteria: { + allOf: [ + { + threshold: 0 + name: 'Metric1' + metricNamespace: 'Insights.Container/persistentvolumes' + metricName: 'pvUsageThresholdViolated' + dimensions: [ + { + name: 'podName' + operator: 'Include' + values: [ + '*' + ] + } + { + name: 'kubernetesNamespace' + operator: 'Include' + values: [ + '*' + ] + } + ] + operator: 'GreaterThan' + timeAggregation: 'Average' + skipMetricValidation: true + criterionType: 'StaticThresholdCriterion' + } + ] + 'odata.type': 'Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria' + } + } +} + + +resource Daily_data_cap_breached_for_workspace_logworkspacename_CIQ_1_name_resource 'microsoft.insights/scheduledqueryrules@2021-02-01-preview' = { + name: 'Daily data cap breached for workspace ${logAnalyticsWorkspaceName} CIQ-1' + location: logAnalyticsWorkspaceLocation + properties: { + displayName: 'Daily data cap breached for workspace ${logAnalyticsWorkspaceName} CIQ-1' + description: 'This alert monitors daily data cap defined on a workspace and fires when the daily data cap is breached.' 
+ severity: 1 + enabled: metricAlertsEnabled + evaluationFrequency: evalFrequency + scopes: [ + resourceId('microsoft.operationalinsights/workspaces', logAnalyticsWorkspaceName) + ] + windowSize: windowSize + autoMitigate: false + criteria: { + allOf: [ + { + query: '_LogOperation | where Operation == "Data collection Status" | where Detail contains "OverQuota"' + timeAggregation: 'Count' + operator: 'GreaterThan' + threshold: 0 + failingPeriods: { + numberOfEvaluationPeriods: 1 + minFailingPeriodsToAlert: 1 + } + } + ] + } + muteActionsDuration: 'P1D' + } +} diff --git a/bicep/aksnetcontrib.bicep b/bicep/aksnetcontrib.bicep index 16e05146f..25982a368 100644 --- a/bicep/aksnetcontrib.bicep +++ b/bicep/aksnetcontrib.bicep @@ -25,7 +25,7 @@ resource uai 'Microsoft.ManagedIdentity/userAssignedIdentities@2018-11-30' exist } resource existing_vnet_cont 'Microsoft.Authorization/roleAssignments@2020-04-01-preview' = { - name: '${guid(user_identity_principalId, existingAksSubnetName)}' + name: '${guid(user_identity_principalId, existingAksSubnetName)}' scope: existingAksSubnet properties: { roleDefinitionId: networkContributorRole diff --git a/bicep/calcAzFwIp.bicep b/bicep/calcAzFwIp.bicep index 53e530011..762cdb667 100644 --- a/bicep/calcAzFwIp.bicep +++ b/bicep/calcAzFwIp.bicep @@ -1,4 +1,7 @@ // As per https://github.com/Azure/bicep/issues/2189#issuecomment-815962675 this file is being used as a UDF +// Takes a subnet range and returns the AzFirewall private Ip address + +@description('A subnet address for the Azure Firewall') param vnetFirewallSubnetAddressPrefix string var subnetOctets = split(vnetFirewallSubnetAddressPrefix,'.') diff --git a/bicep/compiled/main.json b/bicep/compiled/main.json index e5d198ecb..4e341114e 100644 --- a/bicep/compiled/main.json +++ b/bicep/compiled/main.json @@ -5,7 +5,7 @@ "_generator": { "name": "bicep", "version": "0.4.613.9944", - "templateHash": "16436444010258901712" + "templateHash": "5485829007399422839" } }, "parameters": { @@ 
-68,11 +68,17 @@ }, "azureKeyvaultSecretsProvider": { "type": "bool", - "defaultValue": false + "defaultValue": false, + "metadata": { + "description": "Installs the AKS KV CSI provider" + } }, "createKV": { "type": "bool", - "defaultValue": false + "defaultValue": false, + "metadata": { + "description": "Creates a Key Vault" + } }, "AKVserviceEndpointFW": { "type": "string", @@ -88,11 +94,17 @@ }, "azureFirewalls": { "type": "bool", - "defaultValue": false + "defaultValue": false, + "metadata": { + "description": "Create an Azure Firewall" + } }, "ingressApplicationGateway": { "type": "bool", - "defaultValue": false + "defaultValue": false, + "metadata": { + "description": "Create an Application Gateway" + } }, "appGWcount": { "type": "int", @@ -104,7 +116,10 @@ }, "privateIpApplicationGateway": { "type": "string", - "defaultValue": "" + "defaultValue": "", + "metadata": { + "description": "A known private ip in the Application Gateway subnet range to be allocated for internal traffic" + } }, "appgwKVIntegration": { "type": "bool", @@ -123,7 +138,18 @@ }, "appGWenableFirewall": { "type": "bool", - "defaultValue": true + "defaultValue": true, + "metadata": { + "description": "Enable the WAF Firewall, valid for WAF_v2 SKUs" + } + }, + "appGwFirewallMode": { + "type": "string", + "defaultValue": "Prevention", + "allowedValues": [ + "Prevention", + "Detection" + ] }, "dnsPrefix": { "type": "string", @@ -143,7 +169,10 @@ }, "omsagent": { "type": "bool", - "defaultValue": false + "defaultValue": false, + "metadata": { + "description": "Create, and use a new Log Analytics workspace for AKS logs" + } }, "enableAzureRBAC": { "type": "bool", @@ -244,9 +273,42 @@ "type": "string", "defaultValue": "" }, + "AksDiagCategories": { + "type": "array", + "defaultValue": [ + "cluster-autoscaler", + "kube-controller-manager", + "kube-audit-admin", + "guard" + ], + "metadata": { + "description": "Diagnostic categories to log" + } + }, + "createAksMetricAlerts": { + "type": "bool", + 
"defaultValue": true, + "metadata": { + "description": "Enable Metric Alerts" + } + }, + "AksMetricAlertMetricFrequencyModel": { + "type": "string", + "defaultValue": "Long", + "metadata": { + "description": "Which Metric polling frequency model to use" + }, + "allowedValues": [ + "Short", + "Long" + ] + }, "retentionInDays": { "type": "int", - "defaultValue": 30 + "defaultValue": 30, + "metadata": { + "description": "The Log Analytics retention period" + } } }, "functions": [], @@ -277,7 +339,7 @@ }, "appGwFirewallConfigOwasp": { "enabled": "[variables('appGWenableWafFirewall')]", - "firewallMode": "Prevention", + "firewallMode": "[parameters('appGwFirewallMode')]", "ruleSetType": "OWASP", "ruleSetVersion": "3.2", "requestBodyCheck": true, @@ -327,19 +389,32 @@ "aks_identity": { "type": "UserAssigned", "userAssignedIdentities": { - "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', format('id-{0}', parameters('resourceName')))]": {} + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', format('id-aks-{0}', parameters('resourceName')))]": {} } }, "policySetPodSecBaseline": "[resourceId('Microsoft.Authorization/policySetDefinitions', 'a8640138-9b0a-4a28-b8cb-1666c838647d')]", "buildInAKSRBACClusterAdmin": "[resourceId('Microsoft.Authorization/roleDefinitions', 'b1ff04bb-8a4e-4dc4-8eb5-8693973ce19b')]", - "aks_law_name": "[format('log-{0}', parameters('resourceName'))]" + "AlertFrequencyLookup": { + "Short": { + "evalFrequency": "PT1M", + "windowSize": "PT5M" + }, + "Long": { + "evalFrequency": "PT15M", + "windowSize": "PT1H" + } + }, + "AlertFrequency": "[variables('AlertFrequencyLookup')[parameters('AksMetricAlertMetricFrequencyModel')]]", + "aks_law_name": "[format('log-{0}', parameters('resourceName'))]", + "createLaw": "[or(or(parameters('omsagent'), variables('deployAppGw')), parameters('azureFirewalls'))]", + "MonitoringMetricsPublisherRole": "[resourceId('Microsoft.Authorization/roleDefinitions', 
'3913510d-42f4-4e42-8a64-420c390055eb')]" }, "resources": [ { "condition": "[variables('aks_byo_identity')]", "type": "Microsoft.ManagedIdentity/userAssignedIdentities", "apiVersion": "2018-11-30", - "name": "[format('id-{0}', parameters('resourceName'))]", + "name": "[format('id-aks-{0}', parameters('resourceName'))]", "location": "[parameters('location')]" }, { @@ -374,7 +449,7 @@ "type": "Microsoft.Authorization/roleAssignments", "apiVersion": "2021-04-01-preview", "scope": "[format('Microsoft.ContainerRegistry/registries/{0}', variables('acrName'))]", - "name": "[guid(resourceGroup().id, variables('acrName'))]", + "name": "[guid(resourceId('Microsoft.ContainerService/managedClusters', format('aks-{0}', parameters('resourceName'))), 'Acr', variables('AcrPullRole'))]", "properties": { "roleDefinitionId": "[variables('AcrPullRole')]", "principalType": "ServicePrincipal", @@ -431,7 +506,7 @@ "type": "Microsoft.Authorization/roleAssignments", "apiVersion": "2021-04-01-preview", "scope": "[format('Microsoft.Network/applicationGateways/{0}', variables('appgwName'))]", - "name": "[guid(resourceGroup().id, variables('appgwName'), 'appgwcont')]", + "name": "[guid(resourceId('Microsoft.ContainerService/managedClusters', format('aks-{0}', parameters('resourceName'))), 'Agic', variables('contributor'))]", "properties": { "roleDefinitionId": "[variables('contributor')]", "principalType": "ServicePrincipal", @@ -446,7 +521,7 @@ "condition": "[and(variables('DEPLOY_APPGW_ADDON'), variables('deployAppGw'))]", "type": "Microsoft.Authorization/roleAssignments", "apiVersion": "2021-04-01-preview", - "name": "[guid(resourceGroup().id, variables('appgwName'), 'rgread')]", + "name": "[guid(resourceId('Microsoft.ContainerService/managedClusters', format('aks-{0}', parameters('resourceName'))), 'Agic', variables('reader'))]", "properties": { "roleDefinitionId": "[variables('reader')]", "principalType": "ServicePrincipal", @@ -461,7 +536,7 @@ "type": 
"Microsoft.Authorization/roleAssignments", "apiVersion": "2021-04-01-preview", "scope": "[format('Microsoft.ManagedIdentity/userAssignedIdentities/{0}', format('id-appgw-{0}', parameters('resourceName')))]", - "name": "[guid(resourceGroup().id, variables('appgwName'), 'apidentityoperator')]", + "name": "[guid(resourceId('Microsoft.ContainerService/managedClusters', format('aks-{0}', parameters('resourceName'))), 'Agic', variables('managedIdentityOperator'))]", "properties": { "roleDefinitionId": "[variables('managedIdentityOperator')]", "principalType": "ServicePrincipal", @@ -499,7 +574,7 @@ "[resourceId('Microsoft.OperationalInsights/workspaces', variables('aks_law_name'))]", "[resourceId('Microsoft.Network/applicationGateways', variables('appgwName'))]", "[resourceId('Microsoft.Resources/deployments', 'network')]", - "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', format('id-{0}', parameters('resourceName')))]" + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', format('id-aks-{0}', parameters('resourceName')))]" ] }, { @@ -522,7 +597,7 @@ "type": "Microsoft.Authorization/roleAssignments", "apiVersion": "2021-04-01-preview", "scope": "[format('Microsoft.ContainerService/managedClusters/{0}', format('aks-{0}', parameters('resourceName')))]", - "name": "[guid(resourceGroup().id, 'aks_admin_role_assignment')]", + "name": "[guid(resourceId('Microsoft.ContainerService/managedClusters', format('aks-{0}', parameters('resourceName'))), 'aksadmin', variables('buildInAKSRBACClusterAdmin'))]", "properties": { "roleDefinitionId": "[variables('buildInAKSRBACClusterAdmin')]", "principalType": "User", @@ -533,7 +608,31 @@ ] }, { - "condition": "[or(or(parameters('omsagent'), variables('deployAppGw')), parameters('azureFirewalls'))]", + "condition": "[parameters('omsagent')]", + "type": "Microsoft.Insights/diagnosticSettings", + "apiVersion": "2021-05-01-preview", + "scope": "[format('Microsoft.ContainerService/managedClusters/{0}', 
format('aks-{0}', parameters('resourceName')))]", + "name": "aksDiags", + "properties": { + "copy": [ + { + "name": "logs", + "count": "[length(parameters('AksDiagCategories'))]", + "input": { + "category": "[parameters('AksDiagCategories')[copyIndex('logs')]]", + "enabled": true + } + } + ], + "workspaceId": "[resourceId('Microsoft.OperationalInsights/workspaces', variables('aks_law_name'))]" + }, + "dependsOn": [ + "[resourceId('Microsoft.ContainerService/managedClusters', format('aks-{0}', parameters('resourceName')))]", + "[resourceId('Microsoft.OperationalInsights/workspaces', variables('aks_law_name'))]" + ] + }, + { + "condition": "[variables('createLaw')]", "type": "Microsoft.OperationalInsights/workspaces", "apiVersion": "2021-06-01", "name": "[variables('aks_law_name')]", @@ -542,6 +641,21 @@ "retentionInDays": "[parameters('retentionInDays')]" } }, + { + "condition": "[variables('createLaw')]", + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2021-04-01-preview", + "scope": "[format('Microsoft.ContainerService/managedClusters/{0}', format('aks-{0}', parameters('resourceName')))]", + "name": "[guid(resourceId('Microsoft.ContainerService/managedClusters', format('aks-{0}', parameters('resourceName'))), 'omsagent', variables('MonitoringMetricsPublisherRole'))]", + "properties": { + "roleDefinitionId": "[variables('MonitoringMetricsPublisherRole')]", + "principalId": "[reference(resourceId('Microsoft.ContainerService/managedClusters', format('aks-{0}', parameters('resourceName')))).addonProfiles.omsagent.identity.objectId]", + "principalType": "ServicePrincipal" + }, + "dependsOn": [ + "[resourceId('Microsoft.ContainerService/managedClusters', format('aks-{0}', parameters('resourceName')))]" + ] + }, { "condition": "[not(empty(parameters('byoAKSSubnetId')))]", "type": "Microsoft.Resources/deployments", @@ -558,10 +672,10 @@ "value": "[parameters('byoAKSSubnetId')]" }, "user_identity_principalId": { - "value": 
"[if(variables('aks_byo_identity'), reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', format('id-{0}', parameters('resourceName')))).principalId, '')]" + "value": "[if(variables('aks_byo_identity'), reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', format('id-aks-{0}', parameters('resourceName')))).principalId, '')]" }, "user_identity_name": { - "value": "[format('id-{0}', parameters('resourceName'))]" + "value": "[format('id-aks-{0}', parameters('resourceName'))]" }, "user_identity_rg": { "value": "[resourceGroup().name]" @@ -613,7 +727,7 @@ } }, "dependsOn": [ - "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', format('id-{0}', parameters('resourceName')))]" + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', format('id-aks-{0}', parameters('resourceName')))]" ] }, { @@ -640,7 +754,7 @@ "value": "[parameters('vnetAddressPrefix')]" }, "aksPrincipleId": { - "value": "[if(variables('aks_byo_identity'), reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', format('id-{0}', parameters('resourceName')))).principalId, '')]" + "value": "[if(variables('aks_byo_identity'), reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', format('id-aks-{0}', parameters('resourceName')))).principalId, '')]" }, "vnetAksSubnetAddressPrefix": { "value": "[parameters('vnetAksSubnetAddressPrefix')]" @@ -665,7 +779,7 @@ "_generator": { "name": "bicep", "version": "0.4.613.9944", - "templateHash": "2750499435665179669" + "templateHash": "2892736808511445969" } }, "parameters": { @@ -806,12 +920,15 @@ "_generator": { "name": "bicep", "version": "0.4.613.9944", - "templateHash": "2859783932617740105" + "templateHash": "14612576561830144908" } }, "parameters": { "vnetFirewallSubnetAddressPrefix": { - "type": "string" + "type": "string", + "metadata": { + "description": "A subnet address for the Azure Firewall" + } } }, "functions": [], @@ -851,7 +968,7 @@ } }, "dependsOn": [ - 
"[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', format('id-{0}', parameters('resourceName')))]" + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', format('id-aks-{0}', parameters('resourceName')))]" ] }, { @@ -886,7 +1003,7 @@ "_generator": { "name": "bicep", "version": "0.4.613.9944", - "templateHash": "5223086552725847780" + "templateHash": "9789377551391732560" } }, "parameters": { @@ -915,7 +1032,7 @@ "type": "Microsoft.Authorization/roleAssignments", "apiVersion": "2020-04-01-preview", "scope": "[format('Microsoft.Network/dnsZones/{0}', parameters('dnsZoneName'))]", - "name": "[guid(resourceGroup().id, parameters('principalId'))]", + "name": "[guid(resourceGroup().id, parameters('principalId'), variables('DNSZoneContributor'))]", "properties": { "roleDefinitionId": "[variables('DNSZoneContributor')]", "principalType": "ServicePrincipal", @@ -927,7 +1044,7 @@ "type": "Microsoft.Authorization/roleAssignments", "apiVersion": "2020-04-01-preview", "scope": "[format('Microsoft.Network/privateDnsZones/{0}', parameters('dnsZoneName'))]", - "name": "[guid(resourceGroup().id, parameters('principalId'))]", + "name": "[guid(resourceGroup().id, parameters('principalId'), variables('PrivateDNSZoneContributor'))]", "properties": { "roleDefinitionId": "[variables('PrivateDNSZoneContributor')]", "principalType": "ServicePrincipal", @@ -1195,6 +1312,849 @@ "[resourceId('Microsoft.OperationalInsights/workspaces', variables('aks_law_name'))]", "[resourceId('Microsoft.Resources/deployments', 'network')]" ] + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2019-10-01", + "name": "aksmetricalerts", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "clusterName": { + "value": "[format('aks-{0}', parameters('resourceName'))]" + }, + "logAnalyticsWorkspaceName": { + "value": "[variables('aks_law_name')]" + }, + "metricAlertsEnabled": { + "value": 
"[parameters('createAksMetricAlerts')]" + }, + "evalFrequency": { + "value": "[variables('AlertFrequency').evalFrequency]" + }, + "windowSize": { + "value": "[variables('AlertFrequency').windowSize]" + }, + "alertSeverity": { + "value": "Informational" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.4.613.9944", + "templateHash": "6447993220336030841" + } + }, + "parameters": { + "clusterName": { + "type": "string", + "metadata": { + "description": "The name of the AKS Cluster to configure the alerts on" + } + }, + "logAnalyticsWorkspaceName": { + "type": "string", + "metadata": { + "description": "The name of the Log Analytics workspace to log metric data to" + } + }, + "logAnalyticsWorkspaceLocation": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The location of the Log Analytics workspace" + } + }, + "evalFrequency": { + "type": "string", + "defaultValue": "PT1M", + "allowedValues": [ + "PT1M", + "PT15M" + ], + "metadata": { + "description": "Select the frequency on how often the alert rule should be run. 
Selecting frequency smaller than granularity of datapoints grouping will result in sliding window evaluation" + } + }, + "metricAlertsEnabled": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Create the metric alerts as either enabled or disabled" + } + }, + "windowSize": { + "type": "string", + "defaultValue": "PT5M", + "allowedValues": [ + "PT5M", + "PT1H" + ], + "metadata": { + "description": "Defines the interval over which datapoints are grouped using the aggregation type function" + } + }, + "alertSeverity": { + "type": "string", + "defaultValue": "Informational", + "allowedValues": [ + "Critical", + "Error", + "Warning", + "Informational", + "Verbose" + ] + } + }, + "functions": [], + "variables": { + "alertServerityLookup": { + "Critical": 0, + "Error": 1, + "Warning": 2, + "Informational": 3, + "Verbose": 4 + }, + "alertSeverityNumber": "[variables('alertServerityLookup')[parameters('alertSeverity')]]", + "AksResourceId": "[resourceId('Microsoft.ContainerService/managedClusters', parameters('clusterName'))]" + }, + "resources": [ + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Node CPU utilization high for {0} CI-1', parameters('clusterName'))]", + "location": "global", + "properties": { + "criteria": { + "allOf": [ + { + "criterionType": "StaticThresholdCriterion", + "dimensions": [ + { + "name": "host", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "metricName": "cpuUsagePercentage", + "metricNamespace": "Insights.Container/nodes", + "name": "Metric1", + "operator": "GreaterThan", + "threshold": 80, + "timeAggregation": "Average", + "skipMetricValidation": true + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + }, + "description": "Node CPU utilization across the cluster.", + "enabled": "[parameters('metricAlertsEnabled')]", + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + 
"[variables('AksResourceId')]" + ], + "severity": "[variables('alertSeverityNumber')]", + "targetResourceType": "microsoft.containerservice/managedclusters", + "windowSize": "[parameters('windowSize')]" + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Node working set memory utilization high for {0} CI-2', parameters('clusterName'))]", + "location": "global", + "properties": { + "criteria": { + "allOf": [ + { + "criterionType": "StaticThresholdCriterion", + "dimensions": [ + { + "name": "host", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "metricName": "memoryWorkingSetPercentage", + "metricNamespace": "Insights.Container/nodes", + "name": "Metric1", + "operator": "GreaterThan", + "threshold": 80, + "timeAggregation": "Average", + "skipMetricValidation": true + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + }, + "description": "Node working set memory utilization across the cluster.", + "enabled": "[parameters('metricAlertsEnabled')]", + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "severity": "[variables('alertSeverityNumber')]", + "targetResourceType": "microsoft.containerservice/managedclusters", + "windowSize": "[parameters('windowSize')]" + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Jobs completed more than 6 hours ago for {0} CI-11', parameters('clusterName'))]", + "location": "global", + "properties": { + "criteria": { + "allOf": [ + { + "criterionType": "StaticThresholdCriterion", + "dimensions": [ + { + "name": "controllerName", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "kubernetes namespace", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "metricName": "completedJobsCount", + "metricNamespace": "Insights.Container/pods", + "name": "Metric1", + "operator": "GreaterThan", + 
"threshold": 0, + "timeAggregation": "Average", + "skipMetricValidation": true + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + }, + "description": "This alert monitors completed jobs (more than 6 hours ago).", + "enabled": "[parameters('metricAlertsEnabled')]", + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "severity": "[variables('alertSeverityNumber')]", + "targetResourceType": "microsoft.containerservice/managedclusters", + "windowSize": "[parameters('windowSize')]" + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Container CPU usage high for {0} CI-9', parameters('clusterName'))]", + "location": "global", + "properties": { + "criteria": { + "allOf": [ + { + "criterionType": "StaticThresholdCriterion", + "dimensions": [ + { + "name": "controllerName", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "kubernetes namespace", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "metricName": "cpuExceededPercentage", + "metricNamespace": "Insights.Container/containers", + "name": "Metric1", + "operator": "GreaterThan", + "threshold": 90, + "timeAggregation": "Average", + "skipMetricValidation": true + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + }, + "description": "This alert monitors container CPU utilization.", + "enabled": "[parameters('metricAlertsEnabled')]", + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "severity": "[variables('alertSeverityNumber')]", + "targetResourceType": "microsoft.containerservice/managedclusters", + "windowSize": "[parameters('windowSize')]" + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Container working set memory usage high for {0} CI-10', parameters('clusterName'))]", + 
"location": "global", + "properties": { + "criteria": { + "allOf": [ + { + "criterionType": "StaticThresholdCriterion", + "dimensions": [ + { + "name": "controllerName", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "kubernetes namespace", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "metricName": "memoryWorkingSetExceededPercentage", + "metricNamespace": "Insights.Container/containers", + "name": "Metric1", + "operator": "GreaterThan", + "threshold": 90, + "timeAggregation": "Average", + "skipMetricValidation": true + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + }, + "description": "This alert monitors container working set memory utilization.", + "enabled": "[parameters('metricAlertsEnabled')]", + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "severity": "[variables('alertSeverityNumber')]", + "targetResourceType": "microsoft.containerservice/managedclusters", + "windowSize": "[parameters('windowSize')]" + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Pods in failed state for {0} CI-4', parameters('clusterName'))]", + "location": "global", + "properties": { + "criteria": { + "allOf": [ + { + "criterionType": "StaticThresholdCriterion", + "dimensions": [ + { + "name": "phase", + "operator": "Include", + "values": [ + "Failed" + ] + } + ], + "metricName": "podCount", + "metricNamespace": "Insights.Container/pods", + "name": "Metric1", + "operator": "GreaterThan", + "threshold": 0, + "timeAggregation": "Average", + "skipMetricValidation": true + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + }, + "description": "Pod status monitoring.", + "enabled": "[parameters('metricAlertsEnabled')]", + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "severity": 
"[variables('alertSeverityNumber')]", + "targetResourceType": "microsoft.containerservice/managedclusters", + "windowSize": "[parameters('windowSize')]" + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Disk usage high for {0} CI-5', parameters('clusterName'))]", + "location": "global", + "properties": { + "criteria": { + "allOf": [ + { + "criterionType": "StaticThresholdCriterion", + "dimensions": [ + { + "name": "host", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "device", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "metricName": "DiskUsedPercentage", + "metricNamespace": "Insights.Container/nodes", + "name": "Metric1", + "operator": "GreaterThan", + "threshold": 80, + "timeAggregation": "Average", + "skipMetricValidation": true + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + }, + "description": "This alert monitors disk usage for all nodes and storage devices.", + "enabled": "[parameters('metricAlertsEnabled')]", + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "severity": "[variables('alertSeverityNumber')]", + "targetResourceType": "microsoft.containerservice/managedclusters", + "windowSize": "[parameters('windowSize')]" + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Nodes in not ready status for {0} CI-3', parameters('clusterName'))]", + "location": "global", + "properties": { + "criteria": { + "allOf": [ + { + "criterionType": "StaticThresholdCriterion", + "dimensions": [ + { + "name": "status", + "operator": "Include", + "values": [ + "NotReady" + ] + } + ], + "metricName": "nodesCount", + "metricNamespace": "Insights.Container/nodes", + "name": "Metric1", + "operator": "GreaterThan", + "threshold": 0, + "timeAggregation": "Average", + "skipMetricValidation": true + } + ], + "odata.type": 
"Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + }, + "description": "Node status monitoring.", + "enabled": "[parameters('metricAlertsEnabled')]", + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "severity": "[variables('alertSeverityNumber')]", + "targetResourceType": "microsoft.containerservice/managedclusters", + "windowSize": "[parameters('windowSize')]" + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Containers getting OOM killed for {0} CI-6', parameters('clusterName'))]", + "location": "global", + "properties": { + "criteria": { + "allOf": [ + { + "criterionType": "StaticThresholdCriterion", + "dimensions": [ + { + "name": "kubernetes namespace", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "controllerName", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "metricName": "oomKilledContainerCount", + "metricNamespace": "Insights.Container/pods", + "name": "Metric1", + "operator": "GreaterThan", + "threshold": 0, + "timeAggregation": "Average", + "skipMetricValidation": true + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + }, + "description": "This alert monitors number of containers killed due to out of memory (OOM) error.", + "enabled": "[parameters('metricAlertsEnabled')]", + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "severity": "[variables('alertSeverityNumber')]", + "targetResourceType": "microsoft.containerservice/managedclusters", + "windowSize": "[parameters('windowSize')]" + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Persistent volume usage high for {0} CI-18', parameters('clusterName'))]", + "location": "global", + "properties": { + "criteria": { + "allOf": [ + { + "criterionType": "StaticThresholdCriterion", + 
"dimensions": [ + { + "name": "podName", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "kubernetesNamespace", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "metricName": "pvUsageExceededPercentage", + "metricNamespace": "Insights.Container/persistentvolumes", + "name": "Metric1", + "operator": "GreaterThan", + "threshold": 80, + "timeAggregation": "Average", + "skipMetricValidation": true + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + }, + "description": "This alert monitors persistent volume utilization.", + "enabled": false, + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "severity": "[variables('alertSeverityNumber')]", + "targetResourceType": "microsoft.containerservice/managedclusters", + "windowSize": "[parameters('windowSize')]" + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Pods not in ready state for {0} CI-8', parameters('clusterName'))]", + "location": "global", + "properties": { + "criteria": { + "allOf": [ + { + "criterionType": "StaticThresholdCriterion", + "dimensions": [ + { + "name": "controllerName", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "kubernetes namespace", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "metricName": "PodReadyPercentage", + "metricNamespace": "Insights.Container/pods", + "name": "Metric1", + "operator": "LessThan", + "threshold": 80, + "timeAggregation": "Average", + "skipMetricValidation": true + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + }, + "description": "This alert monitors for excessive pods not in the ready state.", + "enabled": "[parameters('metricAlertsEnabled')]", + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "severity": "[variables('alertSeverityNumber')]", + 
"targetResourceType": "microsoft.containerservice/managedclusters", + "windowSize": "[parameters('windowSize')]" + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Restarting container count for {0} CI-7', parameters('clusterName'))]", + "location": "global", + "properties": { + "criteria": { + "allOf": [ + { + "criterionType": "StaticThresholdCriterion", + "dimensions": [ + { + "name": "kubernetes namespace", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "controllerName", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "metricName": "restartingContainerCount", + "metricNamespace": "Insights.Container/pods", + "name": "Metric1", + "operator": "GreaterThan", + "threshold": 0, + "timeAggregation": "Average", + "skipMetricValidation": true + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + }, + "description": "This alert monitors number of containers restarting across the cluster.", + "enabled": "[parameters('metricAlertsEnabled')]", + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "severity": "[variables('alertSeverityNumber')]", + "targetResourceType": "Microsoft.ContainerService/managedClusters", + "windowSize": "[parameters('windowSize')]" + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Container CPU usage violates the configured threshold for {0} CI-19', parameters('clusterName'))]", + "location": "global", + "properties": { + "description": "This alert monitors container CPU usage. 
It uses the threshold defined in the config map.", + "severity": "[variables('alertSeverityNumber')]", + "enabled": true, + "scopes": [ + "[variables('AksResourceId')]" + ], + "evaluationFrequency": "[parameters('evalFrequency')]", + "windowSize": "[parameters('windowSize')]", + "criteria": { + "allOf": [ + { + "threshold": 0, + "name": "Metric1", + "metricNamespace": "Insights.Container/containers", + "metricName": "cpuThresholdViolated", + "dimensions": [ + { + "name": "controllerName", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "kubernetes namespace", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "operator": "GreaterThan", + "timeAggregation": "Average", + "skipMetricValidation": true, + "criterionType": "StaticThresholdCriterion" + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + } + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('Container working set memory usage violates the configured threshold for {0} CI-20', parameters('clusterName'))]", + "location": "global", + "properties": { + "description": "This alert monitors container working set memory usage. 
It uses the threshold defined in the config map.", + "severity": "[variables('alertSeverityNumber')]", + "enabled": "[parameters('metricAlertsEnabled')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "evaluationFrequency": "[parameters('evalFrequency')]", + "windowSize": "[parameters('windowSize')]", + "criteria": { + "allOf": [ + { + "threshold": 0, + "name": "Metric1", + "metricNamespace": "Insights.Container/containers", + "metricName": "memoryWorkingSetThresholdViolated", + "dimensions": [ + { + "name": "controllerName", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "kubernetes namespace", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "operator": "GreaterThan", + "timeAggregation": "Average", + "skipMetricValidation": true, + "criterionType": "StaticThresholdCriterion" + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + } + } + }, + { + "type": "Microsoft.Insights/metricAlerts", + "apiVersion": "2018-03-01", + "name": "[format('PV usage violates the configured threshold for {0} CI-21', parameters('clusterName'))]", + "location": "global", + "properties": { + "description": "This alert monitors PV usage. 
It uses the threshold defined in the config map.", + "severity": "[variables('alertSeverityNumber')]", + "enabled": "[parameters('metricAlertsEnabled')]", + "scopes": [ + "[variables('AksResourceId')]" + ], + "evaluationFrequency": "[parameters('evalFrequency')]", + "windowSize": "[parameters('windowSize')]", + "criteria": { + "allOf": [ + { + "threshold": 0, + "name": "Metric1", + "metricNamespace": "Insights.Container/persistentvolumes", + "metricName": "pvUsageThresholdViolated", + "dimensions": [ + { + "name": "podName", + "operator": "Include", + "values": [ + "*" + ] + }, + { + "name": "kubernetesNamespace", + "operator": "Include", + "values": [ + "*" + ] + } + ], + "operator": "GreaterThan", + "timeAggregation": "Average", + "skipMetricValidation": true, + "criterionType": "StaticThresholdCriterion" + } + ], + "odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria" + } + } + }, + { + "type": "Microsoft.Insights/scheduledQueryRules", + "apiVersion": "2021-02-01-preview", + "name": "[format('Daily data cap breached for workspace {0} CIQ-1', parameters('logAnalyticsWorkspaceName'))]", + "location": "[parameters('logAnalyticsWorkspaceLocation')]", + "properties": { + "displayName": "[format('Daily data cap breached for workspace {0} CIQ-1', parameters('logAnalyticsWorkspaceName'))]", + "description": "This alert monitors daily data cap defined on a workspace and fires when the daily data cap is breached.", + "severity": 1, + "enabled": "[parameters('metricAlertsEnabled')]", + "evaluationFrequency": "[parameters('evalFrequency')]", + "scopes": [ + "[resourceId('microsoft.operationalinsights/workspaces', parameters('logAnalyticsWorkspaceName'))]" + ], + "windowSize": "[parameters('windowSize')]", + "autoMitigate": false, + "criteria": { + "allOf": [ + { + "query": "_LogOperation | where Operation == \"Data collection Status\" | where Detail contains \"OverQuota\"", + "timeAggregation": "Count", + "operator": "GreaterThan", + "threshold": 0, + 
"failingPeriods": { + "numberOfEvaluationPeriods": 1, + "minFailingPeriodsToAlert": 1 + } + } + ] + }, + "muteActionsDuration": "P1D" + } + } + ] + } + }, + "dependsOn": [ + "[resourceId('Microsoft.ContainerService/managedClusters', format('aks-{0}', parameters('resourceName')))]", + "[resourceId('Microsoft.OperationalInsights/workspaces', variables('aks_law_name'))]" + ] } ], "outputs": { @@ -1208,11 +2168,11 @@ }, "LogAnalyticsName": { "type": "string", - "value": "[if(or(or(parameters('omsagent'), variables('deployAppGw')), parameters('azureFirewalls')), variables('aks_law_name'), '')]" + "value": "[if(variables('createLaw'), variables('aks_law_name'), '')]" }, "LogAnalyticsGuid": { "type": "string", - "value": "[if(or(or(parameters('omsagent'), variables('deployAppGw')), parameters('azureFirewalls')), reference(resourceId('Microsoft.OperationalInsights/workspaces', variables('aks_law_name'))).customerId, '')]" + "value": "[if(variables('createLaw'), reference(resourceId('Microsoft.OperationalInsights/workspaces', variables('aks_law_name'))).customerId, '')]" } } } \ No newline at end of file diff --git a/bicep/dnsZone.bicep b/bicep/dnsZone.bicep index 3f9333669..de972080c 100644 --- a/bicep/dnsZone.bicep +++ b/bicep/dnsZone.bicep @@ -14,7 +14,7 @@ resource privateDns 'Microsoft.Network/privateDnsZones@2020-06-01' existing = if var DNSZoneContributor = resourceId('Microsoft.Authorization/roleDefinitions', 'befefa01-2a29-4197-83a8-272ff33ce314') resource dnsContributor 'Microsoft.Authorization/roleAssignments@2020-04-01-preview' = if (!isPrivate) { scope: dns - name: guid(resourceGroup().id, principalId) + name: guid(resourceGroup().id, principalId, DNSZoneContributor) properties: { roleDefinitionId: DNSZoneContributor principalType: 'ServicePrincipal' @@ -25,7 +25,7 @@ resource dnsContributor 'Microsoft.Authorization/roleAssignments@2020-04-01-prev var PrivateDNSZoneContributor = resourceId('Microsoft.Authorization/roleDefinitions', 
'b12aa53e-6015-4669-85d0-8515ebb3ae7f') resource privateDnsContributor 'Microsoft.Authorization/roleAssignments@2020-04-01-preview' = if (isPrivate) { scope: privateDns - name: guid(resourceGroup().id, principalId) + name: guid(resourceGroup().id, principalId, PrivateDNSZoneContributor) properties: { roleDefinitionId: PrivateDNSZoneContributor principalType: 'ServicePrincipal' diff --git a/bicep/main.bicep b/bicep/main.bicep index 0f7427838..fcdbe83cc 100644 --- a/bicep/main.bicep +++ b/bicep/main.bicep @@ -14,16 +14,16 @@ Resource sections 5. Firewall 6. Application Gateway 7. AKS -8. Log Analytics +8. Monitoring / Log Analytics */ -/*_ _ ______ _________ ______ _____ _ _______ _ _ _____ - | \ | | ____|__ __\ \ / / __ \| __ \| |/ /_ _| \ | |/ ____| - | \| | |__ | | \ \ /\ / / | | | |__) | ' / | | | \| | | __ - | . ` | __| | | \ \/ \/ /| | | | _ /| < | | | . ` | | |_ | - | |\ | |____ | | \ /\ / | |__| | | \ \| . \ _| |_| |\ | |__| | - |_| \_|______| |_| \/ \/ \____/|_| \_\_|\_\_____|_| \_|\_____|*/ +/*.__ __. _______ .___________.____ __ ____ ______ .______ __ ___ __ .__ __. _______ +| \ | | | ____|| |\ \ / \ / / / __ \ | _ \ | |/ / | | | \ | | / _____| +| \| | | |__ `---| |----` \ \/ \/ / | | | | | |_) | | ' / | | | \| | | | __ +| . ` | | __| | | \ / | | | | | / | < | | | . ` | | | |_ | +| |\ | | |____ | | \ /\ / | `--' | | |\ \----.| . 
\ | | | |\ | | |__| | +|__| \__| |_______| |__| \__/ \__/ \______/ | _| `._____||__|\__\ |__| |__| \__| \______| */ //Networking can either be one of: custom / byo / default @description('Are you providing your own vNet CIDR blocks') @@ -38,7 +38,7 @@ param byoAGWSubnetId string = '' //--- Custom or BYO networking requires BYO AKS User Identity var aks_byo_identity = custom_vnet || !empty(byoAKSSubnetId) resource uai 'Microsoft.ManagedIdentity/userAssignedIdentities@2018-11-30' = if (aks_byo_identity) { - name: 'id-${resourceName}' + name: 'id-aks-${resourceName}' location: location } @@ -97,8 +97,14 @@ var aksSubnetId = custom_vnet ? network.outputs.aksSubnetId : byoAKSSubnetId var appGwSubnetId = ingressApplicationGateway ? (custom_vnet ? network.outputs.appGwSubnetId : byoAGWSubnetId) : '' -// ----------------------------------------------------------------------- If DNS Zone -// will be solved with 'existing' https://github.com/Azure/bicep/issues/258 + + +/*______ .__ __. _______. ________ ______ .__ __. _______ _______. +| \ | \ | | / | | / / __ \ | \ | | | ____| / | +| .--. || \| | | (----` `---/ / | | | | | \| | | |__ | (----` +| | | || . ` | \ \ / / | | | | | . ` | | __| \ \ +| '--' || |\ | .----) | / /----.| `--' | | |\ | | |____ .----) | +|_______/ |__| \__| |_______/ /________| \______/ |__| \__| |_______||_______/ */ param dnsZoneId string = '' var dnsZoneRg = !empty(dnsZoneId) ? split(dnsZoneId, '/')[4] : '' @@ -116,12 +122,21 @@ module dnsZone './dnsZone.bicep' = if (!empty(dnsZoneId)) { } } -//---------------------------------------------------------------------------------- AKV +/*__ __ _______ ____ ____ ____ ____ ___ __ __ __ .___________. +| |/ / | ____|\ \ / / \ \ / / / \ | | | | | | | | +| ' / | |__ \ \/ / \ \/ / / ^ \ | | | | | | `---| |----` +| < | __| \_ _/ \ / / /_\ \ | | | | | | | | +| . \ | |____ | | \ / / _____ \ | `--' | | `----. 
| | +|__|\__\ |_______| |__| \__/ /__/ \__\ \______/ |_______| |__| */ +@description('Installs the AKS KV CSI provider') param azureKeyvaultSecretsProvider bool = false //This is a preview feature +@description('Creates a Key Vault') param createKV bool = false + param AKVserviceEndpointFW string = '' // either IP, or 'vnetonly' + var akvName = 'kv-${replace(resourceName, '-', '')}' resource kv 'Microsoft.KeyVault/vaults@2021-06-01-preview' = if (createKV) { @@ -186,7 +201,13 @@ resource kv 'Microsoft.KeyVault/vaults@2021-06-01-preview' = if (createKV) { } : {}) } -//---------------------------------------------------------------------------------- ACR +/* ___ ______ .______ + / \ / | | _ \ + / ^ \ | ,----' | |_) | + / /_\ \ | | | / + / _____ \ __| `----. __ | |\ \----. __ +/__/ \__\ (__)\______|(__)| _| `._____|(__)*/ + param registries_sku string = '' param ACRserviceEndpointFW string = '' // either IP, or 'vnetonly' @@ -218,19 +239,31 @@ resource acr 'Microsoft.ContainerRegistry/registries@2021-06-01-preview' = if (! 
} var AcrPullRole = resourceId('Microsoft.Authorization/roleDefinitions', '7f951dda-4ed3-4680-a7ca-43fe172d538d') -// New way of setting scope https://docs.microsoft.com/en-us/azure/azure-resource-manager/templates/scope-extension-resources +var KubeletObjectId = any(aks.properties.identityProfile.kubeletidentity).objectId + resource aks_acr_pull 'Microsoft.Authorization/roleAssignments@2021-04-01-preview' = if (!empty(registries_sku)) { scope: acr // Use when specifying a scope that is different than the deployment scope - name: guid(resourceGroup().id, acrName) + name: '${guid(aks.id, 'Acr' , AcrPullRole)}' properties: { roleDefinitionId: AcrPullRole principalType: 'ServicePrincipal' - principalId: any(aks.properties.identityProfile.kubeletidentity).objectId + principalId: KubeletObjectId } + dependsOn: [ + aks + ] } -//---------------------------------------------------------------------------------- Firewall +/*______ __ .______ _______ ____ __ ____ ___ __ __ +| ____|| | | _ \ | ____|\ \ / \ / / / \ | | | | +| |__ | | | |_) | | |__ \ \/ \/ / / ^ \ | | | | +| __| | | | / | __| \ / / /_\ \ | | | | +| | | | | |\ \----.| |____ \ /\ / / _____ \ | `----.| `----. 
+|__| |__| | _| `._____||_______| \__/ \__/ /__/ \__\ |_______||_______|*/ + +@description('Create an Azure Firewall') param azureFirewalls bool = false + module firewall './firewall.bicep' = if (azureFirewalls && custom_vnet) { name: 'firewall' params: { @@ -242,11 +275,22 @@ module firewall './firewall.bicep' = if (azureFirewalls && custom_vnet) { } } -//---------------------------------------------------------------------------------- AppGateway +/* ___ .______ .______ _______ ____ __ ____ + / \ | _ \ | _ \ / _____|\ \ / \ / / + / ^ \ | |_) | | |_) | | | __ \ \/ \/ / + / /_\ \ | ___/ | ___/ | | |_ | \ / + / _____ \ | | | | __ | |__| | \ /\ / __ +/__/ \__\ | _| | _| (__) \______| \__/ \__/ (__)*/ + +@description('Create an Application Gateway') param ingressApplicationGateway bool = false + param appGWcount int = 2 param appGWmaxCount int = 0 + +@description('A known private ip in the Application Gateway subnet range to be allocated for internal traffic') param privateIpApplicationGateway string = '' + param appgwKVIntegration bool = false @allowed([ @@ -255,6 +299,8 @@ param appgwKVIntegration bool = false ]) @description('The SKU for AppGw') param appGWsku string = 'WAF_v2' + +@description('Enable the WAF Firewall, valid for WAF_v2 SKUs') param appGWenableFirewall bool = true var deployAppGw = ingressApplicationGateway && (custom_vnet || !empty(byoAGWSubnetId)) @@ -263,7 +309,7 @@ var appGWenableWafFirewall = appGWsku=='Standard_v2' ? 
false : appGWenableFirewa // If integrating App Gateway with KeyVault, create a Identity App Gateway will use to access keyvault // 'identity' is always created (adding: "|| deployAppGw") until this is fixed: // https://github.com/Azure/bicep/issues/387#issuecomment-885671296 -resource appGwIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2018-11-30' = if ( /* appgwKVIntegration && */deployAppGw) { +resource appGwIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2018-11-30' = if (deployAppGw) { name: 'id-appgw-${resourceName}' location: location } @@ -304,9 +350,15 @@ var frontendPrivateIpConfig = { name: 'appGatewayPrivateIP' } +@allowed([ + 'Prevention' + 'Detection' +]) +param appGwFirewallMode string = 'Prevention' + var appGwFirewallConfigOwasp = { enabled: appGWenableWafFirewall - firewallMode: 'Prevention' + firewallMode: appGwFirewallMode ruleSetType: 'OWASP' ruleSetVersion: '3.2' requestBodyCheck: true @@ -402,8 +454,7 @@ var appgwProperties = union({ } } : {}) -// 'identity' is always set until this is fixed: -// https://github.com/Azure/bicep/issues/387#issuecomment-885671296 +// 'identity' is always set until this is fixed: https://github.com/Azure/bicep/issues/387#issuecomment-885671296 resource appgw 'Microsoft.Network/applicationGateways@2021-02-01' = if (deployAppGw) { name: appgwName location: location @@ -424,7 +475,7 @@ var contributor = resourceId('Microsoft.Authorization/roleDefinitions', 'b24988a // AGIC's identity requires "Contributor" permission over Application Gateway. 
resource appGwAGICContrib 'Microsoft.Authorization/roleAssignments@2021-04-01-preview' = if (DEPLOY_APPGW_ADDON && deployAppGw) { scope: appgw - name: guid(resourceGroup().id, appgwName, 'appgwcont') + name: '${guid(aks.id, 'Agic', contributor)}' properties: { roleDefinitionId: contributor principalType: 'ServicePrincipal' @@ -436,7 +487,7 @@ resource appGwAGICContrib 'Microsoft.Authorization/roleAssignments@2021-04-01-pr var reader = resourceId('Microsoft.Authorization/roleDefinitions', 'acdd72a7-3385-48ef-bd42-f606fba81ae7') resource appGwAGICRGReader 'Microsoft.Authorization/roleAssignments@2021-04-01-preview' = if (DEPLOY_APPGW_ADDON && deployAppGw) { scope: resourceGroup() - name: guid(resourceGroup().id, appgwName, 'rgread') + name: '${guid(aks.id, 'Agic', reader)}' properties: { roleDefinitionId: reader principalType: 'ServicePrincipal' @@ -446,9 +497,9 @@ resource appGwAGICRGReader 'Microsoft.Authorization/roleAssignments@2021-04-01-p // AGIC's identity requires "Managed Identity Operator" permission over the user assigned identity of Application Gateway. 
var managedIdentityOperator = resourceId('Microsoft.Authorization/roleDefinitions', 'f1a07417-d97a-45cb-824c-7a7467783830') -resource appGwAGICMIOp 'Microsoft.Authorization/roleAssignments@2021-04-01-preview' = if (DEPLOY_APPGW_ADDON && /* appgwKVIntegration && */ deployAppGw) { +resource appGwAGICMIOp 'Microsoft.Authorization/roleAssignments@2021-04-01-preview' = if (DEPLOY_APPGW_ADDON && deployAppGw) { scope: appGwIdentity - name: guid(resourceGroup().id, appgwName, 'apidentityoperator') + name: guid(aks.id, 'Agic', managedIdentityOperator) /* assignment name is a GUID built from the cluster id, the literal 'Agic' and the role definition id; bare expression, no string interpolation needed */ properties: { roleDefinitionId: managedIdentityOperator principalType: 'ServicePrincipal' @@ -456,7 +507,7 @@ resource appGwAGICMIOp 'Microsoft.Authorization/roleAssignments@2021-04-01-previ } } -// ------------------------------------------------------------------ AppGW Diagnostics +// AppGW Diagnostics var diagProperties = { workspaceId: workspaceId logs: [ @@ -486,11 +537,19 @@ resource appgw_Diag 'Microsoft.Insights/diagnosticSettings@2021-05-01-preview' = output ApplicationGatewayName string = deployAppGw ? appgw.name : '' -//---------------------------------------------------------------------------------- AKS +/*_ ___ __ __ .______ _______ .______ .__ __. _______ .___________. _______ _______. +| |/ / | | | | | _ \ | ____|| _ \ | \ | | | ____|| || ____| / | +| ' / | | | | | |_) | | |__ | |_) | | \| | | |__ `---| |----`| |__ | (----` +| < | | | | | _ < | __| | / | . ` | | __| | | | __| \ \ +| . 
\ | `--' | | |_) | | |____ | |\ \----.| |\ | | |____ | | | |____ .----) | +|__|\__\ \______/ |______/ |_______|| _| `._____||__| \__| |_______| |__| |_______||_______/ */ + param dnsPrefix string = '${resourceName}-dns' param kubernetesVersion string = '1.20.9' param enable_aad bool = false param aad_tenant_id string = '' + +@description('Create, and use a new Log Analytics workspace for AKS logs') param omsagent bool = false param enableAzureRBAC bool = false @@ -498,8 +557,11 @@ param upgradeChannel string = '' param osDiskType string = 'Ephemeral' param agentVMSize string = 'Standard_DS2_v2' param osDiskSizeGB int = 0 + param agentCount int = 3 param agentCountMax int = 0 +var autoScale = agentCountMax > agentCount + param maxPods int = 30 param networkPlugin string = 'azure' param networkPolicy string = '' @@ -516,8 +578,6 @@ param serviceCidr string = '10.0.0.0/16' param dnsServiceIP string = '10.0.0.10' param dockerBridgeCidr string = '172.17.0.1/16' -var autoScale = agentCountMax > agentCount - param JustUseSystemPool bool = false @allowed([ @@ -728,7 +788,7 @@ param adminprincipleid string = '' var buildInAKSRBACClusterAdmin = resourceId('Microsoft.Authorization/roleDefinitions', 'b1ff04bb-8a4e-4dc4-8eb5-8693973ce19b') resource aks_admin_role_assignment 'Microsoft.Authorization/roleAssignments@2021-04-01-preview' = if (enableAzureRBAC && !empty(adminprincipleid)) { scope: aks // Use when specifying a scope that is different than the deployment scope - name: guid(resourceGroup().id, 'aks_admin_role_assignment') + name: guid(aks.id, 'aksadmin', buildInAKSRBACClusterAdmin) /* assignment name is a GUID built from the cluster id, the literal 'aksadmin' and the role definition id; bare expression, no string interpolation needed */ properties: { roleDefinitionId: buildInAKSRBACClusterAdmin principalType: 'User' @@ -746,16 +806,100 @@ resource gitops 'Microsoft.KubernetesConfiguration/sourceControlConfigurations@2 } */ +/*__ ___. ______ .__ __. __ .___________. ______ .______ __ .__ __. 
_______ +| \/ | / __ \ | \ | | | | | | / __ \ | _ \ | | | \ | | / _____| +| \ / | | | | | | \| | | | `---| |----`| | | | | |_) | | | | \| | | | __ +| |\/| | | | | | | . ` | | | | | | | | | | / | | | . ` | | | |_ | +| | | | | `--' | | |\ | | | | | | `--' | | |\ \----.| | | |\ | | |__| | +|__| |__| \______/ |__| \__| |__| |__| \______/ | _| `._____||__| |__| \__| \______| */ + + +@description('Diagnostic categories to log') +param AksDiagCategories array = [ + 'cluster-autoscaler' + 'kube-controller-manager' + 'kube-audit-admin' + 'guard' +] + +resource AksDiags 'Microsoft.Insights/diagnosticSettings@2021-05-01-preview' = if (omsagent) { + name: 'aksDiags' + scope: aks + properties: { + workspaceId: aks_law.id + logs: [for aksDiagCategory in AksDiagCategories: { + category: aksDiagCategory + enabled: true + }] + } +} + +@description('Enable Metric Alerts') +param createAksMetricAlerts bool = true + +@allowed([ + 'Short' + 'Long' +]) +@description('Which Metric polling frequency model to use') +param AksMetricAlertMetricFrequencyModel string = 'Long' +/* Maps each frequency model to the evalFrequency / windowSize pair handed to the aksmetricalerts module */ +var AlertFrequencyLookup = { + 'Short': { + evalFrequency: 'PT1M' + windowSize: 'PT5M' + } + 'Long': { + evalFrequency: 'PT15M' + windowSize: 'PT1H' + } +} +var AlertFrequency = AlertFrequencyLookup[AksMetricAlertMetricFrequencyModel] +/* Deployed only when the Log Analytics workspace (aks_law) is created, because the module is passed aks_law.name; createAksMetricAlerts only sets the alerts' enabled flag */ +module aksmetricalerts './aksmetricalerts.bicep' = if (createLaw) { + name: 'aksmetricalerts' + scope: resourceGroup() + params: { + clusterName: aks.name + logAnalyticsWorkspaceName: aks_law.name + metricAlertsEnabled: createAksMetricAlerts + evalFrequency: AlertFrequency.evalFrequency + windowSize: AlertFrequency.windowSize + alertSeverity: 'Informational' + } +} + //---------------------------------------------------------------------------------- Container Insights +@description('The Log Analytics retention period') param retentionInDays int = 30 + var aks_law_name = 'log-${resourceName}' -resource aks_law 'Microsoft.OperationalInsights/workspaces@2021-06-01' = if (omsagent || deployAppGw || 
azureFirewalls) { + +var createLaw = (omsagent || deployAppGw || azureFirewalls) + +resource aks_law 'Microsoft.OperationalInsights/workspaces@2021-06-01' = if (createLaw) { name: aks_law_name location: location properties: { retentionInDays: retentionInDays } } -output LogAnalyticsName string = (omsagent || deployAppGw || azureFirewalls) ? aks_law.name : '' -output LogAnalyticsGuid string = (omsagent || deployAppGw || azureFirewalls) ? aks_law.properties.customerId : '' + +//This role assignment enables AKS->LA Fast Alerting experience +var MonitoringMetricsPublisherRole = resourceId('Microsoft.Authorization/roleDefinitions', '3913510d-42f4-4e42-8a64-420c390055eb') +resource FastAlertingRole_Aks_Law 'Microsoft.Authorization/roleAssignments@2021-04-01-preview' = if (omsagent) { /* gated on omsagent rather than createLaw: createLaw is also true when only deployAppGw or azureFirewalls is set, while principalId below reads the omsagent addon identity (presumably only populated when that addon is enabled - confirm against the aks resource) */ + scope: aks + name: guid(aks.id, 'omsagent', MonitoringMetricsPublisherRole) + properties: { + roleDefinitionId: MonitoringMetricsPublisherRole + principalId: aks.properties.addonProfiles.omsagent.identity.objectId + principalType: 'ServicePrincipal' + } +} + + +output LogAnalyticsName string = (createLaw) ? aks_law.name : '' +output LogAnalyticsGuid string = (createLaw) ? 
aks_law.properties.customerId : '' + +//ASCII Art link : https://textkool.com/en/ascii-art-generator?hl=default&vl=default&font=Star%20Wars&text=changeme diff --git a/bicep/network.bicep b/bicep/network.bicep index 97a1f63c3..3d3ff77b1 100644 --- a/bicep/network.bicep +++ b/bicep/network.bicep @@ -105,15 +105,3 @@ resource aks_vnet_cont 'Microsoft.Network/virtualNetworks/subnets/providers/role principalType: 'ServicePrincipal' } } - -/* -resource aks_vnet_cont 'Microsoft.Authorization/roleAssignments@2020-04-01-preview' = if (!empty(aksPrincipleId)) { - scope: existingAKSSubnet - name: guid(resourceGroup().id, aksPrincipleId) - properties: { - roleDefinitionId: networkContributorRole - principalId: aksPrincipleId - principalType: 'ServicePrincipal' - } -} -*/ diff --git a/referencearchs.md b/referencearchs.md index 6324a9e44..272c5d6ce 100644 --- a/referencearchs.md +++ b/referencearchs.md @@ -29,3 +29,4 @@ When the AKS Baseline is updated, changes are evaluated and rolled into this pro 1. Networking. Hub/Spoke networks typically already exist, and tightly bundling with Kubernetes doesn't work well here. BYO subnets are supported. 1. AppGw Public Listener. AppGw is the WAF ingress point for inbound internet traffic, however private listeners are also valid for fully private environments. 1. Cluster SLA. Is defaulted to off in interests of a more cost optimised default configuration, a parameter can be provided to opt in for the paid SLA. +1. Monitoring Alerts. The metric analysis frequency is parametrised with two presets (one as per the baseline, one less frequent), and the default is the much less frequent preset. Extra monitoring alerts were added as per in-cluster suggestions.