truenas · themylogin · Sep 20, 2024 · Sep 12, 2024 · Sep 17, 2024 · Sep 17, 2024
diff --git a/src/middlewared/middlewared/api/base/decorator.py b/src/middlewared/middlewared/api/base/decorator.py
@@ -12,6 +12,7 @@
 def api_method(
     accepts: type[BaseModel],
     returns: type[BaseModel],
+    *,
     audit: str | None = None,
     audit_callback: bool = False,
     audit_extended: Callable[..., str] | None = None,

diff --git a/src/middlewared/middlewared/api/v25_04_0/__init__.py b/src/middlewared/middlewared/api/v25_04_0/__init__.py
@@ -3,8 +3,10 @@
 from .cloud_sync import *  # noqa
 from .common import *  # noqa
 from .core import *  # noqa
+from .failover_reboot import *  # noqa
 from .group import *  # noqa
 from .keychain import *  # noqa
 from .privilege import *  # noqa
+from .system_reboot import *  # noqa
 from .user import *  # noqa
 from .vendor import *  # noqa
diff --git a/src/middlewared/middlewared/api/v25_04_0/failover_reboot.py b/src/middlewared/middlewared/api/v25_04_0/failover_reboot.py
@@ -0,0 +1,27 @@
+# Copyright (c) - iXsystems Inc.
+#
+# Licensed under the terms of the TrueNAS Enterprise License Agreement
+# See the file LICENSE.IX for complete terms and conditions
+from middlewared.api.base import BaseModel, single_argument_result
+from .system_reboot import SystemRebootInfoResult
+
+__all__ = ["FailoverRebootInfoArgs", "FailoverRebootInfoResult",
+           "FailoverRebootOtherNodeArgs", "FailoverRebootOtherNodeResult"]
+
+
+class FailoverRebootInfoArgs(BaseModel):
+    pass
+
+
+@single_argument_result
+class FailoverRebootInfoResult(BaseModel):
+    this_node: SystemRebootInfoResult
+    other_node: SystemRebootInfoResult | None
+
+
+class FailoverRebootOtherNodeArgs(BaseModel):
+    pass
+
+
+class FailoverRebootOtherNodeResult(BaseModel):
+    result: None
diff --git a/src/middlewared/middlewared/api/v25_04_0/system_reboot.py b/src/middlewared/middlewared/api/v25_04_0/system_reboot.py
@@ -0,0 +1,18 @@
+from middlewared.api.base import BaseModel, single_argument_result
+
+__all__ = ["SystemRebootInfoArgs", "SystemRebootInfoResult"]
+
+
+class SystemRebootInfoArgs(BaseModel):
+    pass
+
+
+class RebootRequiredReason(BaseModel):
+    code: str
+    reason: str
+
+
+@single_argument_result
+class SystemRebootInfoResult(BaseModel):
+    boot_id: str
+    reboot_required_reasons: list[RebootRequiredReason]
diff --git a/src/middlewared/middlewared/plugins/failover_/disabled_reasons.py b/src/middlewared/middlewared/plugins/failover_/disabled_reasons.py
@@ -91,9 +91,9 @@ def get_local_reasons(self, app, ifaces, reasons):
             return
 
         reboot_info = self.middleware.call_sync('failover.reboot.info')
-        if reboot_info['this_node']['reboot_required']:
+        if reboot_info['this_node']['reboot_required_reasons']:
             reasons.add(DisabledReasonsEnum.LOC_FIPS_REBOOT_REQ.name)
-        if reboot_info['other_node']['reboot_required']:
+        if reboot_info['other_node'] is not None and reboot_info['other_node']['reboot_required_reasons']:
             reasons.add(DisabledReasonsEnum.REM_FIPS_REBOOT_REQ.name)
 
         if self.SYSTEM_DATASET_SETUP_IN_PROGRESS:

diff --git a/src/middlewared/middlewared/plugins/failover_/reboot.py b/src/middlewared/middlewared/plugins/failover_/reboot.py
@@ -3,13 +3,23 @@
 # Licensed under the terms of the TrueNAS Enterprise License Agreement
 # See the file LICENSE.IX for complete terms and conditions
 import asyncio
+from dataclasses import dataclass
 import errno
 import time
 
-from middlewared.schema import accepts, Bool, Dict, UUID, returns
+from middlewared.api import api_method
+from middlewared.api.current import (
+    FailoverRebootInfoArgs, FailoverRebootInfoResult,
+    FailoverRebootOtherNodeArgs, FailoverRebootOtherNodeResult,
+)
 from middlewared.service import CallError, job, private, Service
 
-FIPS_KEY = 'fips_toggled'
+
+@dataclass
+class RemoteRebootReason:
+    # Boot ID for which the reboot is required. `None` means that the system must be rebooted when it comes online.
+    boot_id: str | None
+    reason: str
 
 
 class FailoverRebootService(Service):
@@ -18,72 +28,74 @@ class Config:
         cli_namespace = 'system.failover.reboot'
         namespace = 'failover.reboot'
 
+    remote_reboot_reasons: dict[str, RemoteRebootReason] = {}
+
     @private
-    async def boot_ids(self):
-        info = {
-            'this_node': {'id': await self.middleware.call('system.boot_id')},
-            'other_node': {'id': None}
-        }
+    async def add_remote_reason(self, code: str, reason: str):
+        """
+        Adds a reason for why the remote system needs a reboot.
+        This will be appended to the list of the reasons that the remote node itself returns.
+        :param code: unique identifier for the reason.
+        :param reason: text explanation for the reason.
+        """
         try:
-            info['other_node']['id'] = await self.middleware.call(
-                'failover.call_remote', 'system.boot_id', [], {
-                    'raise_connect_error': False, 'timeout': 2, 'connect_timeout': 2,
-                }
-            )
+            boot_id = await self.middleware.call('failover.call_remote', 'system.boot_id', [], {
+                'raise_connect_error': False,
+                'timeout': 2,
+                'connect_timeout': 2,
+            })
         except Exception:
-            self.logger.warning('Unexpected error querying remote node boot id', exc_info=True)
+            self.logger.warning('Unexpected error querying remote reboot boot id', exc_info=True)
+            # Remote system is inaccessible, so, when it comes back, another reboot will be required.
+            boot_id = None
 
-        return info
+        self.remote_reboot_reasons[code] = RemoteRebootReason(boot_id, reason)
 
-    @private
-    async def info_impl(self):
-        # initial state
-        current_info = await self.boot_ids()
-        current_info['this_node'].update({'reboot_required': None})
-        current_info['other_node'].update({'reboot_required': None})
-
-        fips_change_info = await self.middleware.call('keyvalue.get', FIPS_KEY, False)
-        if not fips_change_info:
-            for i in current_info:
-                current_info[i]['reboot_required'] = False
-            return current_info
-
-        for key in ('this_node', 'other_node'):
-            current_info[key]['reboot_required'] = all((
-                fips_change_info[key]['id'] == current_info[key]['id'],
-                fips_change_info[key]['reboot_required']
-            ))
-
-        if all((
-            current_info['this_node']['reboot_required'] is False,
-            current_info['other_node']['reboot_required'] is False,
-        )):
-            # no reboot required for either controller so delete
-            await self.middleware.call('keyvalue.delete', FIPS_KEY)
-
-        return current_info
-
-    @accepts(roles=['FAILOVER_READ'])
-    @returns(Dict(
-        'info',
-        Dict('this_node', UUID('id'), Bool('reboot_required')),
-        Dict('other_node', UUID('id', null=True), Bool('reboot_required', null=True)),
-    ))
+        await self.send_event()
+
+    @api_method(FailoverRebootInfoArgs, FailoverRebootInfoResult, roles=['FAILOVER_READ'])
     async def info(self):
-        """Returns the local and remote nodes boot_ids along with their
-        reboot statuses (i.e. does a reboot need to take place)"""
-        return await self.info_impl()
-
-    @accepts(roles=['FAILOVER_READ'])
-    @returns(Bool())
-    async def required(self):
-        """Returns whether this node needs to be rebooted for failover/security
-        system configuration changes to take effect."""
-        # TODO: should we raise Callerror/ValidationError if reboot_required is None?
-        return (await self.info())['this_node']['reboot_required'] is True
-
-    @accepts(roles=['FULL_ADMIN'])
-    @returns()
+        changed = False
+
+        try:
+            other_node = await self.middleware.call('failover.call_remote', 'system.reboot.info', [], {
+                'raise_connect_error': False,
+                'timeout': 2,
+                'connect_timeout': 2,
+            })
+        except Exception:
+            self.logger.warning('Unexpected error querying remote reboot info', exc_info=True)
+            other_node = None
+
+        if other_node is not None:
+            for remote_reboot_reason_code, remote_reboot_reason in list(self.remote_reboot_reasons.items()):
+                if remote_reboot_reason.boot_id is None:
+                    # This reboot reason was added while the remote node was not functional.
+                    # In that case, when the remote system comes online, an additional reboot is required.
+                    remote_reboot_reason.boot_id = other_node['boot_id']
+                    changed = True
+
+                if remote_reboot_reason.boot_id == other_node['boot_id']:
+                    other_node['reboot_required_reasons'].append({
+                        'code': remote_reboot_reason_code,
+                        'reason': remote_reboot_reason.reason,
+                    })
+                else:
+                    # The system was rebooted, this reason is not valid anymore
+                    self.remote_reboot_reasons.pop(remote_reboot_reason_code)
+                    changed = True
+
+        info = {
+            'this_node': await self.middleware.call('system.reboot.info'),
+            'other_node': other_node,
+        }
+
+        if changed:
+            await self.send_event(info)
+
+        return info
+
+    @api_method(FailoverRebootOtherNodeArgs, FailoverRebootOtherNodeResult, roles=['FULL_ADMIN'])
     @job(lock='reboot_standby')
     async def other_node(self, job):
         """
@@ -132,3 +144,26 @@ async def other_node(self, job):
 
         job.set_progress(100, 'Other controller rebooted successfully')
         return True
+
+    @private
+    async def send_event(self, info=None):
+        if info is None:
+            info = await self.info()
+
+        self.middleware.send_event('failover.reboot.info', 'CHANGED', id=None, fields=info)
+
+
+async def reboot_info(middleware, *args, **kwargs):
+    await middleware.call('failover.reboot.send_event')
+
+
+def remote_reboot_info(middleware, *args, **kwargs):
+    middleware.call_sync('failover.reboot.send_event', background=True)
+
+
+async def setup(middleware):
+    middleware.event_register('failover.reboot.info', 'Sent when a system reboot is required.', roles=['FAILOVER_READ'])
+
+    middleware.event_subscribe('system.reboot.info', remote_reboot_info)
+    await middleware.call('failover.remote_on_connect', remote_reboot_info)
+    await middleware.call('failover.remote_subscribe', 'system.reboot.info', remote_reboot_info)
diff --git a/src/middlewared/middlewared/plugins/failover_/remote.py b/src/middlewared/middlewared/plugins/failover_/remote.py
@@ -67,9 +67,9 @@ def connect_and_wait(self, *, legacy=False):
         try:
             with Client(url, reserved_ports=True) as c:
                 self.client = c
-                self.connected.set()
-                # Subscribe to all events on connection
                 with self._subscribe_lock:
+                    self.connected.set()
+                    # Subscribe to all events on connection
                     for name in self._subscriptions:
                         self.client.subscribe(name, partial(self._sub_callback, name))
                 self._on_connect()
@@ -162,9 +162,9 @@ def call(self, *args, **kwargs):
             raise CallError(str(e), e.errno or errno.EFAULT)
 
     def subscribe(self, name, callback):
-        # Only subscribe if we are already connected, otherwise simply register it
-        if name not in self._subscriptions and self.is_connected():
-            with self._subscribe_lock:
+        with self._subscribe_lock:
+            # Only subscribe if we are already connected, otherwise simply register it
+            if name not in self._subscriptions and self.is_connected():
                 self.client.subscribe(name, partial(self._sub_callback, name))
         self._subscriptions[name].append(callback)
 

diff --git a/src/middlewared/middlewared/plugins/security/update.py b/src/middlewared/middlewared/plugins/security/update.py
@@ -1,10 +1,12 @@
 import middlewared.sqlalchemy as sa
 
 from middlewared.plugins.failover_.disabled_reasons import DisabledReasonsEnum
-from middlewared.plugins.failover_.reboot import FIPS_KEY
 from middlewared.schema import accepts, Bool, Dict, Int, Patch
 from middlewared.service import ConfigService, ValidationError, job, private
 
+FIPS_REBOOT_CODE = 'FIPS'
+FIPS_REBOOT_REASON = 'FIPS configuration was changed.'
+
 
 class SystemSecurityModel(sa.Model):
     __tablename__ = 'system_security'
@@ -32,46 +34,23 @@ async def configure_fips_on_ha(self, is_ha, job):
             return
 
         await self.middleware.call('failover.call_remote', 'etc.generate', ['fips'])
-        boot_info = await self.middleware.call('failover.reboot.info')
-        if boot_info['this_node']['reboot_required']:
-            # means FIPS is being toggled but this node is already pending a reboot
-            # so it means the user toggled FIPS twice without a reboot in between
-            boot_info['this_node']['reboot_required'] = False
-            await self.middleware.call('keyvalue.set', FIPS_KEY, boot_info)
-        else:
-            # means FIPS is toggled and this node isn't pending a reboot, so mark it
-            # as such
-            boot_info['this_node']['reboot_required'] = True
-            await self.middleware.call('keyvalue.set', FIPS_KEY, boot_info)
 
-        if boot_info['other_node']['reboot_required']:
-            # means FIPS is being toggled but other node is already pending a reboot
+        remote_reboot_reasons = await self.middleware.call('failover.call_remote', 'system.reboot.list_reasons')
+        if FIPS_REBOOT_CODE in remote_reboot_reasons:
+            # means FIPS is being toggled but other node is already pending a reboot,
             # so it means the user toggled FIPS twice and somehow the other node
             # didn't reboot (even though we do this automatically). This is an edge
             # case and means someone or something is doing things behind our backs
-            boot_info['other_node']['reboot_required'] = False
-            await self.middleware.call('keyvalue.set', FIPS_KEY, boot_info)
+            await self.middleware.call('failover.call_remote', 'system.reboot.remove_reason', [FIPS_REBOOT_CODE])
         else:
             try:
                 # we automatically reboot (and wait for) the other controller
                 reboot_job = await self.middleware.call('failover.reboot.other_node')
                 await job.wrap(reboot_job)
             except Exception:
-                self.logger.error('Unexpected failure rebooting the other node', exc_info=True)
-                # something extravagant happened so we'll just play it safe and say that
+                # something extravagant happened, so we'll just play it safe and say that
                 # another reboot is required
-                boot_info['other_node']['reboot_required'] = True
-            else:
-                new_info = await self.middleware.call('failover.reboot.info')
-                if boot_info['other_node']['id'] == new_info['other_node']['id']:
-                    # standby "rebooted" but the boot id is the same....not good
-                    self.logger.warning('Other node claims it rebooted but boot id is the same')
-                    boot_info['other_node']['reboot_required'] = True
-                else:
-                    boot_info['other_node']['id'] = new_info['other_node']['id']
-                    boot_info['other_node']['reboot_required'] = False
-
-            await self.middleware.call('keyvalue.set', FIPS_KEY, boot_info)
+                await self.middleware.call('failover.reboot.add_remote_reason', FIPS_REBOOT_CODE, FIPS_REBOOT_REASON)
 
     @private
     async def validate(self, is_ha, ha_disabled_reasons):
@@ -125,7 +104,7 @@ async def do_update(self, job, data):
             # TODO: We likely need to do some SSH magic as well
             #  let's investigate the exact configuration there
             await self.middleware.call('etc.generate', 'fips')
-            # TODO: We need to fix this for non-HA iX hardware...
+            await self.middleware.call('system.reboot.toggle_reason', FIPS_REBOOT_CODE, FIPS_REBOOT_REASON)
             await self.configure_fips_on_ha(is_ha, job)
 
         return await self.config()