Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NAS-128203 / 25.04 / reboot plugin #14530

Merged
merged 5 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/middlewared/middlewared/api/base/decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
def api_method(
accepts: type[BaseModel],
returns: type[BaseModel],
*,
audit: str | None = None,
audit_callback: bool = False,
audit_extended: Callable[..., str] | None = None,
Expand Down
2 changes: 2 additions & 0 deletions src/middlewared/middlewared/api/v25_04_0/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
from .cloud_sync import * # noqa
from .common import * # noqa
from .core import * # noqa
from .failover_reboot import * # noqa
from .group import * # noqa
from .keychain import * # noqa
from .privilege import * # noqa
from .system_reboot import * # noqa
from .user import * # noqa
from .vendor import * # noqa
27 changes: 27 additions & 0 deletions src/middlewared/middlewared/api/v25_04_0/failover_reboot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright (c) - iXsystems Inc.
#
# Licensed under the terms of the TrueNAS Enterprise License Agreement
# See the file LICENSE.IX for complete terms and conditions
from middlewared.api.base import BaseModel, single_argument_result
from .system_reboot import SystemRebootInfoResult

__all__ = ["FailoverRebootInfoArgs", "FailoverRebootInfoResult",
"FailoverRebootOtherNodeArgs", "FailoverRebootOtherNodeResult"]


class FailoverRebootInfoArgs(BaseModel):
pass


@single_argument_result
class FailoverRebootInfoResult(BaseModel):
this_node: SystemRebootInfoResult
other_node: SystemRebootInfoResult | None


class FailoverRebootOtherNodeArgs(BaseModel):
pass


class FailoverRebootOtherNodeResult(BaseModel):
result: None
18 changes: 18 additions & 0 deletions src/middlewared/middlewared/api/v25_04_0/system_reboot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from middlewared.api.base import BaseModel, single_argument_result

__all__ = ["SystemRebootInfoArgs", "SystemRebootInfoResult"]


class SystemRebootInfoArgs(BaseModel):
pass


class RebootRequiredReason(BaseModel):
code: str
reason: str


@single_argument_result
class SystemRebootInfoResult(BaseModel):
boot_id: str
reboot_required_reasons: list[RebootRequiredReason]
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,9 @@ def get_local_reasons(self, app, ifaces, reasons):
return

reboot_info = self.middleware.call_sync('failover.reboot.info')
if reboot_info['this_node']['reboot_required']:
if reboot_info['this_node']['reboot_required_reasons']:
reasons.add(DisabledReasonsEnum.LOC_FIPS_REBOOT_REQ.name)
if reboot_info['other_node']['reboot_required']:
if reboot_info['other_node'] is not None and reboot_info['other_node']['reboot_required_reasons']:
reasons.add(DisabledReasonsEnum.REM_FIPS_REBOOT_REQ.name)

if self.SYSTEM_DATASET_SETUP_IN_PROGRESS:
Expand Down
159 changes: 97 additions & 62 deletions src/middlewared/middlewared/plugins/failover_/reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,23 @@
# Licensed under the terms of the TrueNAS Enterprise License Agreement
# See the file LICENSE.IX for complete terms and conditions
import asyncio
from dataclasses import dataclass
import errno
import time

from middlewared.schema import accepts, Bool, Dict, UUID, returns
from middlewared.api import api_method
from middlewared.api.current import (
FailoverRebootInfoArgs, FailoverRebootInfoResult,
FailoverRebootOtherNodeArgs, FailoverRebootOtherNodeResult,
)
from middlewared.service import CallError, job, private, Service

FIPS_KEY = 'fips_toggled'

@dataclass
class RemoteRebootReason:
# Boot ID for which the reboot is required. `None` means that the system must be rebooted when it comes online.
boot_id: str | None
reason: str


class FailoverRebootService(Service):
Expand All @@ -18,72 +28,74 @@ class Config:
cli_namespace = 'system.failover.reboot'
namespace = 'failover.reboot'

remote_reboot_reasons: dict[str, RemoteRebootReason] = {}

@private
async def boot_ids(self):
info = {
'this_node': {'id': await self.middleware.call('system.boot_id')},
'other_node': {'id': None}
}
async def add_remote_reason(self, code: str, reason: str):
"""
Adds a reason for why the remote system needs a reboot.
This will be appended to the list of the reasons that the remote node itself returns.
:param code: unique identifier for the reason.
:param reason: text explanation for the reason.
"""
try:
info['other_node']['id'] = await self.middleware.call(
'failover.call_remote', 'system.boot_id', [], {
'raise_connect_error': False, 'timeout': 2, 'connect_timeout': 2,
}
)
boot_id = await self.middleware.call('failover.call_remote', 'system.boot_id', [], {
'raise_connect_error': False,
'timeout': 2,
'connect_timeout': 2,
})
except Exception:
self.logger.warning('Unexpected error querying remote node boot id', exc_info=True)
self.logger.warning('Unexpected error querying remote reboot boot id', exc_info=True)
# Remote system is inaccessible, so, when it comes back, another reboot will be required.
boot_id = None

return info
self.remote_reboot_reasons[code] = RemoteRebootReason(boot_id, reason)

@private
async def info_impl(self):
# initial state
current_info = await self.boot_ids()
current_info['this_node'].update({'reboot_required': None})
current_info['other_node'].update({'reboot_required': None})

fips_change_info = await self.middleware.call('keyvalue.get', FIPS_KEY, False)
if not fips_change_info:
for i in current_info:
current_info[i]['reboot_required'] = False
return current_info

for key in ('this_node', 'other_node'):
current_info[key]['reboot_required'] = all((
fips_change_info[key]['id'] == current_info[key]['id'],
fips_change_info[key]['reboot_required']
))

if all((
current_info['this_node']['reboot_required'] is False,
current_info['other_node']['reboot_required'] is False,
)):
# no reboot required for either controller so delete
await self.middleware.call('keyvalue.delete', FIPS_KEY)

return current_info

@accepts(roles=['FAILOVER_READ'])
@returns(Dict(
'info',
Dict('this_node', UUID('id'), Bool('reboot_required')),
Dict('other_node', UUID('id', null=True), Bool('reboot_required', null=True)),
))
await self.send_event()

@api_method(FailoverRebootInfoArgs, FailoverRebootInfoResult, roles=['FAILOVER_READ'])
async def info(self):
"""Returns the local and remote nodes boot_ids along with their
reboot statuses (i.e. does a reboot need to take place)"""
return await self.info_impl()

@accepts(roles=['FAILOVER_READ'])
@returns(Bool())
async def required(self):
"""Returns whether this node needs to be rebooted for failover/security
system configuration changes to take effect."""
# TODO: should we raise Callerror/ValidationError if reboot_required is None?
return (await self.info())['this_node']['reboot_required'] is True

@accepts(roles=['FULL_ADMIN'])
@returns()
changed = False

try:
other_node = await self.middleware.call('failover.call_remote', 'system.reboot.info', [], {
'raise_connect_error': False,
'timeout': 2,
'connect_timeout': 2,
})
except Exception:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You'll need to handle ENOMETHOD a bit more gracefully here since this will always fail on HA upgrade.

self.logger.warning('Unexpected error querying remote reboot info', exc_info=True)
other_node = None

if other_node is not None:
for remote_reboot_reason_code, remote_reboot_reason in list(self.remote_reboot_reasons.items()):
if remote_reboot_reason.boot_id is None:
# This reboot reason was added while the remote node was not functional.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason why I used the keyvalue plugin was to avoid double-reboot scenarios like this so I would interpret this new behavior as a regression. Was there a reason why you stopped using keyvalue?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are correct. I added the persistence for self.remote_reboot_reasons via keyvalue plugin.

# In that case, when the remote system comes online, an additional reboot is required.
remote_reboot_reason.boot_id = other_node['boot_id']
changed = True

if remote_reboot_reason.boot_id == other_node['boot_id']:
other_node['reboot_required_reasons'].append({
'code': remote_reboot_reason_code,
'reason': remote_reboot_reason.reason,
})
else:
# The system was rebooted, this reason is not valid anymore
self.remote_reboot_reasons.pop(remote_reboot_reason_code)
changed = True

info = {
'this_node': await self.middleware.call('system.reboot.info'),
'other_node': other_node,
}

if changed:
await self.send_event(info)

return info

@api_method(FailoverRebootOtherNodeArgs, FailoverRebootOtherNodeResult, roles=['FULL_ADMIN'])
@job(lock='reboot_standby')
async def other_node(self, job):
"""
Expand Down Expand Up @@ -132,3 +144,26 @@ async def other_node(self, job):

job.set_progress(100, 'Other controller rebooted successfully')
return True

@private
async def send_event(self, info=None):
if info is None:
info = await self.info()

self.middleware.send_event('failover.reboot.info', 'CHANGED', id=None, fields=info)


async def reboot_info(middleware, *args, **kwargs):
await middleware.call('failover.reboot.send_event')


def remote_reboot_info(middleware, *args, **kwargs):
middleware.call_sync('failover.reboot.send_event', background=True)


async def setup(middleware):
middleware.event_register('failover.reboot.info', 'Sent when a system reboot is required.', roles=['FAILOVER_READ'])

middleware.event_subscribe('system.reboot.info', remote_reboot_info)
await middleware.call('failover.remote_on_connect', remote_reboot_info)
await middleware.call('failover.remote_subscribe', 'system.reboot.info', remote_reboot_info)
10 changes: 5 additions & 5 deletions src/middlewared/middlewared/plugins/failover_/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@ def connect_and_wait(self, *, legacy=False):
try:
with Client(url, reserved_ports=True) as c:
self.client = c
self.connected.set()
# Subscribe to all events on connection
with self._subscribe_lock:
self.connected.set()
# Subscribe to all events on connection
for name in self._subscriptions:
self.client.subscribe(name, partial(self._sub_callback, name))
self._on_connect()
Expand Down Expand Up @@ -162,9 +162,9 @@ def call(self, *args, **kwargs):
raise CallError(str(e), e.errno or errno.EFAULT)

def subscribe(self, name, callback):
# Only subscribe if we are already connected, otherwise simply register it
if name not in self._subscriptions and self.is_connected():
with self._subscribe_lock:
with self._subscribe_lock:
# Only subscribe if we are already connected, otherwise simply register it
if name not in self._subscriptions and self.is_connected():
self.client.subscribe(name, partial(self._sub_callback, name))
self._subscriptions[name].append(callback)

Expand Down
41 changes: 10 additions & 31 deletions src/middlewared/middlewared/plugins/security/update.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import middlewared.sqlalchemy as sa

from middlewared.plugins.failover_.disabled_reasons import DisabledReasonsEnum
from middlewared.plugins.failover_.reboot import FIPS_KEY
from middlewared.schema import accepts, Bool, Dict, Int, Patch
from middlewared.service import ConfigService, ValidationError, job, private

FIPS_REBOOT_CODE = 'FIPS'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather create an enum class in the system/reboot_reasons.py file (or similar) and use that instead of having random global variables in different plugins. This will become important for tools like hactl.

FIPS_REBOOT_REASON = 'FIPS configuration was changed.'


class SystemSecurityModel(sa.Model):
__tablename__ = 'system_security'
Expand Down Expand Up @@ -32,46 +34,23 @@ async def configure_fips_on_ha(self, is_ha, job):
return

await self.middleware.call('failover.call_remote', 'etc.generate', ['fips'])
boot_info = await self.middleware.call('failover.reboot.info')
if boot_info['this_node']['reboot_required']:
# means FIPS is being toggled but this node is already pending a reboot
# so it means the user toggled FIPS twice without a reboot in between
boot_info['this_node']['reboot_required'] = False
await self.middleware.call('keyvalue.set', FIPS_KEY, boot_info)
else:
# means FIPS is toggled and this node isn't pending a reboot, so mark it
# as such
boot_info['this_node']['reboot_required'] = True
await self.middleware.call('keyvalue.set', FIPS_KEY, boot_info)

if boot_info['other_node']['reboot_required']:
# means FIPS is being toggled but other node is already pending a reboot
remote_reboot_reasons = await self.middleware.call('failover.call_remote', 'system.reboot.list_reasons')
if FIPS_REBOOT_CODE in remote_reboot_reasons:
# means FIPS is being toggled but other node is already pending a reboot,
# so it means the user toggled FIPS twice and somehow the other node
# didn't reboot (even though we do this automatically). This is an edge
# case and means someone or something is doing things behind our backs
boot_info['other_node']['reboot_required'] = False
await self.middleware.call('keyvalue.set', FIPS_KEY, boot_info)
await self.middleware.call('failover.call_remote', 'system.reboot.remove_reason', [FIPS_REBOOT_CODE])
else:
try:
# we automatically reboot (and wait for) the other controller
reboot_job = await self.middleware.call('failover.reboot.other_node')
await job.wrap(reboot_job)
except Exception:
self.logger.error('Unexpected failure rebooting the other node', exc_info=True)
# something extravagant happened so we'll just play it safe and say that
# something extravagant happened, so we'll just play it safe and say that
# another reboot is required
boot_info['other_node']['reboot_required'] = True
else:
new_info = await self.middleware.call('failover.reboot.info')
if boot_info['other_node']['id'] == new_info['other_node']['id']:
# standby "rebooted" but the boot id is the same....not good
self.logger.warning('Other node claims it rebooted but boot id is the same')
boot_info['other_node']['reboot_required'] = True
else:
boot_info['other_node']['id'] = new_info['other_node']['id']
boot_info['other_node']['reboot_required'] = False

await self.middleware.call('keyvalue.set', FIPS_KEY, boot_info)
await self.middleware.call('failover.reboot.add_remote_reason', FIPS_REBOOT_CODE, FIPS_REBOOT_REASON)

@private
async def validate(self, is_ha, ha_disabled_reasons):
Expand Down Expand Up @@ -125,7 +104,7 @@ async def do_update(self, job, data):
# TODO: We likely need to do some SSH magic as well
# let's investigate the exact configuration there
await self.middleware.call('etc.generate', 'fips')
# TODO: We need to fix this for non-HA iX hardware...
await self.middleware.call('system.reboot.toggle_reason', FIPS_REBOOT_CODE, FIPS_REBOOT_REASON)
await self.configure_fips_on_ha(is_ha, job)

return await self.config()
Loading
Loading