Skip to content

Commit

Permalink
[PLAT-15353] Consistency checks testing hooks
Browse files Browse the repository at this point in the history
Summary:
This diff adds some testing hooks for consistency check testing.
test_pending will shut down YBA immediately after the update is applied to the DB, to test handling of pending tasks on YBA restart.
ysql_timeout_secs is a flag that allows configuration of ysql timeout for consistency check commands (default is 30s)
update_delay_secs allows you to introduce a sleep after the DB commit occurs, simulating a slow/unreliable network connection. During this time YBA can be interrupted or ysql timeout may occur.

Also adds some minor improvements around the consistency_check table (renamed to yba_consistency_check) and adds yw_uuid and yw_ip columns that may be useful going forward. Only the yw_uuid is populated for now.

Test Plan:
consistency check with flags enabled, ensure failures are triggered
regular task execution not impacted

Reviewers: sanketh, dshubin, nsingh, anijhawan

Reviewed By: sanketh, nsingh

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D37950
  • Loading branch information
mchiddy committed Sep 17, 2024
1 parent def0fac commit 87a936a
Show file tree
Hide file tree
Showing 40 changed files with 398 additions and 198 deletions.
3 changes: 2 additions & 1 deletion managed/RUNTIME-FLAGS.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@
| "Use server broadcast address for yb_backup" | "yb.backup.use_server_broadcast_address_for_yb_backup" | "UNIVERSE" | "Controls whether server_broadcast_address entry should be used during yb_backup.py backup/restore" | "Boolean" |
| "Slow Queries Timeout" | "yb.query_stats.slow_queries.timeout_secs" | "UNIVERSE" | "Timeout in secs for slow queries" | "Long" |
| "YSQL Queries Timeout" | "yb.ysql_timeout_secs" | "UNIVERSE" | "Timeout in secs for YSQL queries" | "Long" |
| "YSQL Queries Timeout for Consistency Check Operations" | "yb.universe.consistency_check.ysql_timeout_secs" | "UNIVERSE" | "Timeout in secs for YSQL queries run by consistency check operations" | "Long" |
| "Number of cores to keep" | "yb.num_cores_to_keep" | "UNIVERSE" | "Controls the configuration to set the number of cores to keep in the Ansible layer" | "Integer" |
| "Whether to check YBA xCluster object is in sync with DB replication group" | "yb.xcluster.ensure_sync_get_replication_status" | "UNIVERSE" | "It ensures that the YBA XCluster object for tables that are in replication is in sync with replication group in DB. If they are not in sync and this is true, getting the xCluster object will throw an exception and the user has to resync the xCluster config." | "Boolean" |
| "Network Load balancer health check ports" | "yb.universe.network_load_balancer.custom_health_check_ports" | "UNIVERSE" | "Ports to use for health checks performed by the network load balancer. Invalid and duplicate ports will be ignored. For GCP, only the first health check port would be used." | "Integer List" |
Expand Down Expand Up @@ -256,6 +257,6 @@
| "Enable health checks for time drift between nodes" | "yb.health_checks.check_clock_time_drift" | "UNIVERSE" | "Enable health checks for time drift between nodes." | "Boolean" |
| "Time drift threshold for warning health check" | "yb.health_checks.time_drift_wrn_threshold_ms" | "UNIVERSE" | "Threshold to raise a warning when time drift exceeds this amount" | "Integer" |
| "Time drift threshold for error health check" | "yb.health_checks.time_drift_err_threshold_ms" | "UNIVERSE" | "Threshold to raise an error when time drift exceeds this amount" | "Integer" |
| "Enable consistency check for universe" | "yb.universe.consistency_check_enabled" | "UNIVERSE" | "When enabled, all universe operations will attempt consistency check validation before proceeding. Turn off in disaster scenarios to force perform actions." | "Boolean" |
| "Enable consistency check for universe" | "yb.universe.consistency_check.enabled" | "UNIVERSE" | "When enabled, all universe operations will attempt consistency check validation before proceeding. Turn off in disaster scenarios to force perform actions." | "Boolean" |
| "Fail the health check if no clock sync service is found" | "yb.health_checks.clock_sync_service_required" | "UNIVERSE" | "Require chrony or ntp(d) to be installed for health check to pass" | "Boolean" |
| "Node Agent Enabler Installation Time-out" | "yb.node_agent.enabler.install_timeout" | "UNIVERSE" | "Node agent enabler installation time-out for the universe" | "Duration" |
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,27 @@ public CustomBuilder taskTypes(Collection<? extends TaskType> taskTypes) {
TaskType.DeleteBackupScheduleKubernetes,
TaskType.EnableNodeAgentInUniverse);

/**
 * Task types that are exempt from the universe consistency check. When the type of the running
 * task is in this set, {@code lockAndFreezeUniverseForUpdate} does not create the consistency
 * check table task, regardless of the consistency check runtime flag.
 */
private static final Set<TaskType> SKIP_CONSISTENCY_CHECK_TASKS =
ImmutableSet.of(
TaskType.CreateBackup,
TaskType.CreateBackupSchedule,
TaskType.CreateBackupScheduleKubernetes,
TaskType.CreateKubernetesUniverse,
TaskType.CreateSupportBundle,
TaskType.CreateUniverse,
TaskType.BackupUniverse,
TaskType.DeleteBackupSchedule,
TaskType.DeleteBackupScheduleKubernetes,
TaskType.DeleteDrConfig,
TaskType.DeletePitrConfig,
TaskType.DeleteXClusterConfig,
TaskType.DestroyUniverse,
TaskType.DestroyKubernetesUniverse,
TaskType.EditBackupSchedule,
TaskType.EditBackupScheduleKubernetes,
TaskType.MultiTableBackup,
TaskType.ReadOnlyClusterDelete);

private static final Set<TaskType> RERUNNABLE_PLACEMENT_MODIFICATION_TASKS =
ImmutableSet.of(
TaskType.GFlagsUpgrade,
Expand Down Expand Up @@ -1172,6 +1193,13 @@ public Universe lockAndFreezeUniverseForUpdate(
Universe universe = lockUniverseForUpdate(universeUuid, updater);
try {
createPrecheckTasks(universe);
TaskType taskType = getTaskExecutor().getTaskType(getClass());
if (!SKIP_CONSISTENCY_CHECK_TASKS.contains(taskType)
&& confGetter.getConfForScope(universe, UniverseConfKeys.enableConsistencyCheck)
&& universe.getUniverseDetails().getPrimaryCluster().userIntent.replicationFactor > 1) {
log.info("Creating consistency check task for task {}", taskType);
checkAndCreateConsistencyCheckTableTask(universe.getUniverseDetails().getPrimaryCluster());
}
if (isFirstTry()) {
createFreezeUniverseTask(universeUuid, firstRunTxnCallback)
.setSubTaskGroupType(SubTaskGroupType.ValidateConfigurations);
Expand All @@ -1181,14 +1209,6 @@ public Universe lockAndFreezeUniverseForUpdate(
createFreezeUniverseTask(universeUuid)
.setSubTaskGroupType(SubTaskGroupType.ValidateConfigurations);
}
if (confGetter.getConfForScope(universe, UniverseConfKeys.enableConsistencyCheck)) {
TaskType taskType = getTaskExecutor().getTaskType(getClass());
if (taskType != TaskType.CreateUniverse && taskType != TaskType.CreateKubernetesUniverse) {
log.info("Creating consistency check task for task {}", taskType);
checkAndCreateConsistencyCheckTableTask(
universe.getUniverseDetails().getPrimaryCluster());
}
}
return Universe.getOrBadRequest(universeUuid);
} catch (RuntimeException e) {
unlockUniverseForUpdate(universeUuid);
Expand Down Expand Up @@ -1462,7 +1482,7 @@ public void createDropSystemPlatformDBTablesTask(
universe,
CommonTypes.TableType.PGSQL_TABLE_TYPE,
Util.SYSTEM_PLATFORM_DB,
Util.CONSISTENCY_CHECK)
Util.CONSISTENCY_CHECK_TABLE_NAME)
.setSubTaskGroupType(subTaskGroupType);
}

Expand Down
Loading

0 comments on commit 87a936a

Please sign in to comment.