# Source code for saltext.vmware.modules.cluster_ha
# Copyright 2021 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
import logging
import salt.exceptions
import saltext.vmware.utils.cluster as utils_cluster
import saltext.vmware.utils.datacenter as utils_datacenter
import saltext.vmware.utils.esxi as utils_esxi
from saltext.vmware.utils.connect import get_service_instance
# Module-level logger named after this module.
log = logging.getLogger(__name__)

# pyVmomi is imported defensively: HAS_PYVMOMI records whether the import
# succeeded, and __virtual__ checks it before exposing the module.
try:
    from pyVmomi import vim
    HAS_PYVMOMI = True
except ImportError:
    HAS_PYVMOMI = False

# Name returned by __virtual__; the CLI examples in this file call the
# functions as ``vmware_cluster_ha.<function>``.
__virtualname__ = "vmware_cluster_ha"
__proxyenabled__ = ["vmware_cluster_ha"]
def __virtual__():
    """
    Decide whether this module can be loaded.

    Returns the virtual module name when pyVmomi was imported successfully,
    otherwise a ``(False, reason)`` tuple explaining why it cannot load.
    """
    if HAS_PYVMOMI:
        return __virtualname__
    return False, "Unable to import pyVmomi module."
def _set_slot_based_admission_control_params(cluster_spec, admission_control_policy):
    """
    Apply the slot based admission control policy to the cluster spec

    cluster_spec
        Cluster config spec whose ``dasConfig`` is updated in place.

    admission_control_policy
        Dictionary whose ``slot_based_admission_control`` entry supplies
        ``failover_level`` and ``resource_reduction_to_tolerate_percent``.
        Keys that are absent are passed on as ``None``.
    """
    das_config = cluster_spec.dasConfig
    policy = vim.cluster.FailoverLevelAdmissionControlPolicy()
    # Attach the policy first, then fill it in through the local reference.
    das_config.admissionControlPolicy = policy
    settings = admission_control_policy.get("slot_based_admission_control", {})
    policy.failoverLevel = settings.get("failover_level")
    policy.resourceReductionToToleratePercent = settings.get(
        "resource_reduction_to_tolerate_percent"
    )
    das_config.admissionControlEnabled = True
def _set_failover_host_admission_control_params(
    cluster_spec, admission_control_policy, service_instance, datacenter, cluster
):
    """
    Apply the failover host admission control policy to the cluster spec

    cluster_spec
        Cluster config spec whose ``dasConfig`` is updated in place.

    admission_control_policy
        Dictionary whose ``failover_host_admission_control`` entry supplies
        ``failover_hosts`` (host names), ``failover_level`` and
        ``resource_reduction_to_tolerate_percent``. Keys that are absent are
        passed on as ``None``.

    service_instance
        Service instance used to look up the failover hosts.

    datacenter
        Datacenter name passed to the host lookup.

    cluster
        Cluster name passed to the host lookup.
    """
    das_config = cluster_spec.dasConfig
    policy = vim.cluster.FailoverHostAdmissionControlPolicy()
    # Attach the policy first, then fill it in through the local reference.
    das_config.admissionControlPolicy = policy
    settings = admission_control_policy.get("failover_host_admission_control", {})
    # Resolve the configured host names through the esxi utility module.
    failover_hosts = utils_esxi.get_hosts(
        service_instance=service_instance,
        datacenter_name=datacenter,
        cluster_name=cluster,
        host_names=settings.get("failover_hosts"),
    )
    policy.failoverHosts = failover_hosts
    policy.failoverLevel = settings.get("failover_level")
    policy.resourceReductionToToleratePercent = settings.get(
        "resource_reduction_to_tolerate_percent"
    )
    das_config.admissionControlEnabled = True
def _set_reservation_based_admission_control_params(cluster_spec, admission_control_policy):
    """
    Apply the reservation based admission control policy to the cluster spec

    cluster_spec
        Cluster config spec whose ``dasConfig`` is updated in place.

    admission_control_policy
        Dictionary whose ``reservation_based_admission_control`` entry supplies
        ``failover_level``, ``autocompute_percentages``,
        ``cpu_failover_resources_percent``, ``memory_failover_resources_percent``
        and ``resource_reduction_to_tolerate_percent``. Keys that are absent
        are passed on as ``None``.
    """
    das_config = cluster_spec.dasConfig
    policy = vim.cluster.FailoverResourcesAdmissionControlPolicy()
    # Attach the policy first, then fill it in through the local reference.
    das_config.admissionControlPolicy = policy
    settings = admission_control_policy.get("reservation_based_admission_control", {})
    policy.failoverLevel = settings.get("failover_level")
    auto_compute = settings.get("autocompute_percentages")
    policy.autoComputePercentages = auto_compute
    # Explicit CPU/memory percentages are only set when auto-compute is off.
    if not auto_compute:
        policy.cpuFailoverResourcesPercent = settings.get("cpu_failover_resources_percent")
        policy.memoryFailoverResourcesPercent = settings.get(
            "memory_failover_resources_percent"
        )
    policy.resourceReductionToToleratePercent = settings.get(
        "resource_reduction_to_tolerate_percent"
    )
    das_config.admissionControlEnabled = True
def _set_admission_control_params(
    cluster_spec, admission_control_policy, service_instance, datacenter, cluster
):
    """
    Set admission control params

    Admission control is first disabled on ``cluster_spec.dasConfig``. The
    first recognised key in ``admission_control_policy`` then selects the
    policy helper, which enables it again. Keys are checked in this order:
    ``slot_based_admission_control``, ``failover_host_admission_control``,
    ``reservation_based_admission_control``.

    cluster_spec
        Cluster config spec whose ``dasConfig`` is updated in place.

    admission_control_policy
        Dictionary describing the policy. ``None`` or an empty dictionary
        leaves admission control disabled.

    service_instance
        Service instance, only used by the failover host policy.

    datacenter
        Datacenter name, only used by the failover host policy.

    cluster
        Cluster name, only used by the failover host policy.
    """
    # Treat a missing policy like an empty one so the membership tests below
    # cannot fail on None.
    admission_control_policy = admission_control_policy or {}
    cluster_spec.dasConfig.admissionControlEnabled = False
    if "slot_based_admission_control" in admission_control_policy:
        _set_slot_based_admission_control_params(cluster_spec, admission_control_policy)
    elif "failover_host_admission_control" in admission_control_policy:
        _set_failover_host_admission_control_params(
            cluster_spec, admission_control_policy, service_instance, datacenter, cluster
        )
    elif "reservation_based_admission_control" in admission_control_policy:
        _set_reservation_based_admission_control_params(cluster_spec, admission_control_policy)
# NOTE(review): the ``vim.*`` defaults below are evaluated when this ``def``
# statement runs, so ``vim`` must be importable at module import time. If the
# guarded pyVmomi import above fails, this raises NameError before
# ``__virtual__`` can report the missing dependency.
def configure(
    cluster,
    datacenter,
    enable=False,
    host_monitoring=vim.cluster.DasConfigInfo.ServiceState.enabled,
    vm_monitoring=vim.cluster.DasConfigInfo.VmMonitoringState.vmMonitoringDisabled,
    vm_component_protecting=vim.cluster.DasConfigInfo.ServiceState.disabled,
    vm_min_up_time=120,
    vm_max_failure_window=-1,
    vm_max_failures=3,
    vm_failure_interval=30,
    isolation_response=vim.cluster.DasVmSettings.IsolationResponse.powerOff,
    restart_priority=vim.cluster.DasVmSettings.RestartPriority.medium,
    restart_priority_timeout=120,
    enable_apd_timeout_for_hosts=False,
    vm_reaction_on_apd_cleared="none",
    vm_storage_protection_for_apd="warning",
    vm_storage_protection_for_pdl="warning",
    vm_terminate_delay_for_apd_sec=180,
    admission_control_policy=None,
    advanced_options=None,
    service_instance=None,
):
    """
    Configure HA for a given cluster

    Supported proxies: esxcluster

    Returns ``{cluster: True}`` on success. If a ``VMwareApiError`` or
    ``VMwareRuntimeError`` is raised, returns
    ``{cluster: False, "reason": "<error message>"}``.

    cluster
        The cluster name

    datacenter
        The datacenter name to which the cluster belongs

    enable
        Enable HA for the cluster

    host_monitoring
        Determines whether HA restarts virtual machines after a host fails. Valid values - enabled, disabled. Default - enabled.

    vm_monitoring
        Specifies the level of HA Virtual Machine Health Monitoring Service. Valid values - vmAndAppMonitoring, vmMonitoringDisabled
        and vmMonitoringOnly. Default - vmMonitoringDisabled.

    vm_component_protecting
        Indicates if vSphere HA VM Component Protection service is enabled. Valid values - enabled, disabled. Default - disabled.

    vm_min_up_time
        The number of seconds for the virtual machine's heartbeats to stabilize after the virtual machine has been powered on.
        This time should include the guest operating system boot-up time. The virtual machine monitoring will begin only after this period.
        Default - 120 seconds.

    vm_max_failure_window
        The number of seconds for the window during which up to maxFailures resets can occur before automated responses stop.
        If set to -1, no failure window is specified. Default -1.

    vm_max_failures
        Maximum number of failures and automated resets allowed during the time that maxFailureWindow specifies.
        If maxFailureWindow is -1 (no window), this represents the absolute number of failures after which automated response is stopped.
        If a virtual machine exceeds this threshold, in-depth problem analysis is usually needed.
        The default value is 3.

    vm_failure_interval
        If no heartbeat has been received for at least the specified number of seconds, the virtual machine is declared as failed.
        The default value is 30.

    isolation_response
        Indicates whether or not the virtual machine should be powered off if a host determines that it is isolated from the rest of the compute resource.
        If not specified at either the cluster level or the virtual machine level, this will default to powerOff.

    restart_priority
        Restart priority for a virtual machine.
        If not specified at either the cluster level or the virtual machine level, this will default to medium.

    restart_priority_timeout
        This setting is used to specify a maximum time the lower priority VMs should wait for the higher priority VMs to be ready.
        If the higher priority VMs are not ready by this time, then the lower priority VMs are restarted irrespective of the VM ready state.
        This timeout can be used to prevent the failover of lower priority VMs to be stuck infinitely.
        Default - 120

    enable_apd_timeout_for_hosts
        This property indicates if APD timeout will be enabled for all the hosts in the cluster when vSphere HA is configured.
        Default - False

    vm_reaction_on_apd_cleared
        Action taken by VM Component Protection service for a powered on VM when APD condition clears after APD timeout.
        Default - none

    vm_storage_protection_for_apd
        VM storage protection setting for storage failures categorized as All Paths Down (APD).
        Valid values - disabled, warning, restartConservative, restartAggressive, clusterDefault. Default - warning

    vm_storage_protection_for_pdl
        VM storage protection setting for storage failures categorized as Permanent Device Loss (PDL).
        Valid values - disabled, warning, restartConservative, restartAggressive, clusterDefault. Default - warning

    vm_terminate_delay_for_apd_sec
        The time interval after an APD timeout has been declared and before VM Component Protection service will terminate the VM.
        Default 180 seconds.

    admission_control_policy
        Specify the admission control policy for the cluster as a dictionary.
        Exactly one of the following top-level keys is honoured; when none is
        given, admission control is disabled.

        .. code-block:: json

            {
                "slot_based_admission_control": {
                    "failover_level": 1,
                    "resource_reduction_to_tolerate_percent": 20
                }
            }

            {
                "failover_host_admission_control": {
                    "failover_level": 10,
                    "resource_reduction_to_tolerate_percent": 30,
                    "failover_hosts": ["host1", "host2"]
                }
            }

            {
                "reservation_based_admission_control": {
                    "failover_level": 22,
                    "resource_reduction_to_tolerate_percent": 33,
                    "autocompute_percentages": false,
                    "cpu_failover_resources_percent": 45,
                    "memory_failover_resources_percent": 56
                }
            }

    advanced_options
        Advanced options for the cluster, to be passed in as a dictionary.

    service_instance
        Use this vCenter service connection instance instead of creating a new one. (optional).

    CLI Example:

    .. code-block:: bash

        salt '*' vmware_cluster_ha.configure cluster1 dc1 enable=True
    """
    if service_instance is None:
        service_instance = get_service_instance(opts=__opts__, pillar=__pillar__)
    admission_control_policy = admission_control_policy or {}
    try:
        dc_ref = utils_datacenter.get_datacenter(service_instance, datacenter)
        cluster_ref = utils_cluster.get_cluster(dc_ref=dc_ref, cluster=cluster)

        # Cluster-wide HA settings.
        cluster_spec = vim.cluster.ConfigSpecEx()
        cluster_spec.dasConfig = vim.cluster.DasConfigInfo()
        cluster_spec.dasConfig.enabled = enable
        cluster_spec.dasConfig.hostMonitoring = host_monitoring
        cluster_spec.dasConfig.vmMonitoring = vm_monitoring
        cluster_spec.dasConfig.vmComponentProtecting = vm_component_protecting

        # VM health monitoring settings; vm_monitoring is set here as well as
        # on dasConfig above.
        vm_tool_spec = vim.cluster.VmToolsMonitoringSettings()
        vm_tool_spec.vmMonitoring = vm_monitoring
        vm_tool_spec.minUpTime = vm_min_up_time
        vm_tool_spec.maxFailureWindow = vm_max_failure_window
        vm_tool_spec.maxFailures = vm_max_failures
        vm_tool_spec.failureInterval = vm_failure_interval

        # Default per-VM HA settings for the cluster.
        das_spec = vim.cluster.DasVmSettings()
        das_spec.isolationResponse = isolation_response
        das_spec.restartPriority = restart_priority
        das_spec.restartPriorityTimeout = restart_priority_timeout
        das_spec.vmToolsMonitoringSettings = vm_tool_spec

        # VM Component Protection (APD / PDL) settings.
        component_protection_spec = vim.cluster.VmComponentProtectionSettings()
        component_protection_spec.enableAPDTimeoutForHosts = enable_apd_timeout_for_hosts
        component_protection_spec.vmReactionOnAPDCleared = vm_reaction_on_apd_cleared
        component_protection_spec.vmStorageProtectionForAPD = vm_storage_protection_for_apd
        component_protection_spec.vmStorageProtectionForPDL = vm_storage_protection_for_pdl
        component_protection_spec.vmTerminateDelayForAPDSec = vm_terminate_delay_for_apd_sec
        das_spec.vmComponentProtectionSettings = component_protection_spec

        _set_admission_control_params(
            cluster_spec=cluster_spec,
            admission_control_policy=admission_control_policy,
            service_instance=service_instance,
            datacenter=datacenter,
            cluster=cluster,
        )
        cluster_spec.dasConfig.defaultVmSettings = das_spec

        # One OptionValue per advanced option; an empty list when none given.
        cluster_spec.dasConfig.option = [
            vim.OptionValue(key=key, value=value)
            for key, value in (advanced_options or {}).items()
        ]
        utils_cluster.update_cluster(cluster_ref=cluster_ref, cluster_spec=cluster_spec)
    except (salt.exceptions.VMwareApiError, salt.exceptions.VMwareRuntimeError) as exc:
        return {cluster: False, "reason": str(exc)}
    return {cluster: True}
def get(cluster_name, datacenter_name, service_instance=None):
    """
    Get HA info about a cluster in a datacenter

    Returns a dictionary of the cluster's HA settings. The keys
    ``admission_control_policy``, ``failover_level`` and
    ``resource_reduction_to_tolerate_percent`` are always present and are
    ``None`` when the cluster has no admission control policy. Default VM
    settings are only included when the cluster reports them. If a
    ``VMwareApiError`` or ``VMwareRuntimeError`` is raised, returns
    ``{cluster_name: False, "reason": "<error message>"}`` instead.

    cluster_name
        The cluster name

    datacenter_name
        The datacenter name to which the cluster belongs

    service_instance
        Use this vCenter service connection instance instead of creating a new one. (optional).

    CLI Example:

    .. code-block:: bash

        salt '*' vmware_cluster_ha.get cluster_name=cl1 datacenter_name=dc1
    """
    ret = {}
    if service_instance is None:
        service_instance = get_service_instance(opts=__opts__, pillar=__pillar__)
    try:
        dc_ref = utils_datacenter.get_datacenter(service_instance, datacenter_name)
        cluster_ref = utils_cluster.get_cluster(dc_ref=dc_ref, cluster=cluster_name)
        das_config = cluster_ref.configurationEx.dasConfig
        ret["enabled"] = das_config.enabled
        ret["host_monitoring"] = das_config.hostMonitoring
        ret["vm_monitoring"] = das_config.vmMonitoring
        ret["vm_component_protecting"] = das_config.vmComponentProtecting

        default_vm_settings = das_config.defaultVmSettings
        if default_vm_settings:
            # The nested settings objects may be unset; skip their keys rather
            # than fail with an AttributeError the except below does not catch.
            tools_settings = default_vm_settings.vmToolsMonitoringSettings
            if tools_settings is not None:
                # Overrides the cluster-level vm_monitoring value set above.
                ret["vm_monitoring"] = tools_settings.vmMonitoring
                ret["vm_min_up_time"] = tools_settings.minUpTime
                ret["vm_max_failure_window"] = tools_settings.maxFailureWindow
                ret["vm_max_failures"] = tools_settings.maxFailures
                ret["vm_failure_interval"] = tools_settings.failureInterval
            ret["isolation_response"] = default_vm_settings.isolationResponse
            ret["restart_priority"] = default_vm_settings.restartPriority
            ret["restart_priority_timeout"] = default_vm_settings.restartPriorityTimeout
            protection_settings = default_vm_settings.vmComponentProtectionSettings
            if protection_settings is not None:
                ret["enable_apd_timeout_for_hosts"] = protection_settings.enableAPDTimeoutForHosts
                ret["vm_reaction_on_apd_cleared"] = protection_settings.vmReactionOnAPDCleared
                ret[
                    "vm_storage_protection_for_apd"
                ] = protection_settings.vmStorageProtectionForAPD
                ret[
                    "vm_storage_protection_for_pdl"
                ] = protection_settings.vmStorageProtectionForPDL
                ret[
                    "vm_terminate_delay_for_apd_sec"
                ] = protection_settings.vmTerminateDelayForAPDSec

        ret["admission_control_enabled"] = das_config.admissionControlEnabled
        policy = das_config.admissionControlPolicy
        ret["admission_control_policy"] = None
        # getattr keeps both keys present (as None) when no policy is set.
        ret["failover_level"] = getattr(policy, "failoverLevel", None)
        ret["resource_reduction_to_tolerate_percent"] = getattr(
            policy, "resourceReductionToToleratePercent", None
        )
        if isinstance(policy, vim.cluster.FailoverLevelAdmissionControlPolicy):
            ret["admission_control_policy"] = "slot_based_admission_control"
        elif isinstance(policy, vim.cluster.FailoverHostAdmissionControlPolicy):
            ret["admission_control_policy"] = "failover_host_admission_control"
            ret["failover_hosts"] = [h.name for h in policy.failoverHosts or []]
        elif isinstance(policy, vim.cluster.FailoverResourcesAdmissionControlPolicy):
            ret["admission_control_policy"] = "reservation_based_admission_control"
            ret["autocompute_percentages"] = policy.autoComputePercentages
            ret["cpu_failover_resources_percent"] = policy.cpuFailoverResourcesPercent
            ret["memory_failover_resources_percent"] = policy.memoryFailoverResourcesPercent

        # Reuse the das_config read above instead of re-reading it from
        # cluster_ref.
        ret["advanced_settings"] = {opt.key: opt.value for opt in das_config.option or []}
    except (salt.exceptions.VMwareApiError, salt.exceptions.VMwareRuntimeError) as exc:
        return {cluster_name: False, "reason": str(exc)}
    return ret