Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

windows restart service #1681

Merged
merged 36 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
ea333c1
add basic working restart service installation and subcommand
zackattack01 Apr 11, 2024
59c87cf
wire in opts from launcher proper and localLogger, more clean up
zackattack01 Apr 16, 2024
0bc3cc9
gofmt and remove unused account const
zackattack01 Apr 16, 2024
69d049e
stash changes for rebase with exit wrapper updates
zackattack01 Apr 30, 2024
90be693
fix up restart service to use new os.Exit wrapper
zackattack01 Apr 30, 2024
7997c98
add sqlite log publication
zackattack01 Apr 30, 2024
1648fae
update restart service to use new gowrapper
zackattack01 Apr 30, 2024
1b52d9e
clean up and close log publisher on interrupt
zackattack01 Apr 30, 2024
e0cdfd2
fix DeleteRows when called without rowids
zackattack01 May 1, 2024
c700844
add in enable_launcher_watchdog flag and update types+knapsack
zackattack01 May 1, 2024
b0e1262
remove watchdog when disabled, add launcher watchdog to options
zackattack01 May 1, 2024
eca53df
unstage tests, stash working version post refactor
zackattack01 May 14, 2024
0de997e
rework logstore interfaces, break off from kevalue, bugfixes
zackattack01 May 17, 2024
7d405e7
re-add staged fix for non-windows compilation
zackattack01 May 17, 2024
ac9a49b
test updates
zackattack01 May 17, 2024
04c0288
bring back test updates
zackattack01 May 17, 2024
2a54920
gofmt
zackattack01 May 17, 2024
3cd80b4
remove temporary option manipulation from local testing
zackattack01 May 29, 2024
c891283
remove flag from startup settings, reset tests
zackattack01 May 29, 2024
cf7144c
unstage options for PR breakout
zackattack01 May 30, 2024
0333b2f
add some documentation, small optimizations
zackattack01 Jun 3, 2024
88896fa
test updates
zackattack01 Jun 4, 2024
fbfeed8
add explicit install and removal logs, update docs
zackattack01 Jun 5, 2024
6bb1380
remove duplicate install log
zackattack01 Jun 5, 2024
fc198db
PR feedback: early returns for failed installation paths, better logging
zackattack01 Jun 5, 2024
c66661e
PR feedback: use backoff.Waitfor for service restart logic
zackattack01 Jun 5, 2024
d359bab
Apply suggestions from code review
zackattack01 Jun 6, 2024
ac8ff41
PR feedback: move watchdog package to ee, rename docs to README.md
zackattack01 Jun 6, 2024
d0aec27
PR feedback: const for serviceResetPeriod, logging and code flow impr…
zackattack01 Jun 6, 2024
733c591
Update ee/agent/storage/sqlite/logstore_sqlite.go
zackattack01 Jun 6, 2024
fc5b0f0
PR review: move docs to docs/architecture, add clarifying comments, m…
zackattack01 Jun 25, 2024
1be4237
PR feedback and refactor: bring logwriter logic directly into sqliteS…
zackattack01 Jun 25, 2024
2d68f1e
add logic to integrate power_event_watcher, don't restart when in mod…
zackattack01 Jul 1, 2024
7f4df2b
PR review: update rungroup name to be camelCase
zackattack01 Jul 2, 2024
ea0fbad
PR Review: add backoffs to service manager connection and service del…
zackattack01 Jul 2, 2024
093c89e
Update ee/watchdog/watchdog_service_windows.go
zackattack01 Jul 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions cmd/launcher/launcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/kolide/launcher/cmd/launcher/internal"
"github.com/kolide/launcher/ee/agent"
"github.com/kolide/launcher/ee/agent/flags"
"github.com/kolide/launcher/ee/agent/flags/keys"
"github.com/kolide/launcher/ee/agent/knapsack"
"github.com/kolide/launcher/ee/agent/startupsettings"
"github.com/kolide/launcher/ee/agent/storage"
Expand All @@ -41,6 +42,7 @@ import (
kolidelog "github.com/kolide/launcher/ee/log/osquerylogs"
"github.com/kolide/launcher/ee/powereventwatcher"
"github.com/kolide/launcher/ee/tuf"
"github.com/kolide/launcher/ee/watchdog"
"github.com/kolide/launcher/pkg/augeas"
"github.com/kolide/launcher/pkg/backoff"
"github.com/kolide/launcher/pkg/contexts/ctxlog"
Expand Down Expand Up @@ -283,6 +285,17 @@ func runLauncher(ctx context.Context, cancel func(), multiSlogger, systemMultiSl
go checkpointer.Once(ctx)
runGroup.Add("logcheckpoint", checkpointer.Run, checkpointer.Interrupt)

watchdogController, err := watchdog.NewController(ctx, k)
if err != nil { // log any issues here but move on, watchdog is not critical path
slogger.Log(ctx, slog.LevelError,
"could not init watchdog controller",
"err", err,
)
} else if watchdogController != nil { // watchdogController will be nil on non-windows platforms for now
k.RegisterChangeObserver(watchdogController, keys.LauncherWatchdogEnabled)
runGroup.Add("watchdogController", watchdogController.Run, watchdogController.Interrupt)
}

// Create a channel for signals
sigChannel := make(chan os.Signal, 1)

Expand Down
3 changes: 3 additions & 0 deletions cmd/launcher/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/kolide/kit/logutil"
"github.com/kolide/kit/version"
"github.com/kolide/launcher/ee/tuf"
"github.com/kolide/launcher/ee/watchdog"
"github.com/kolide/launcher/pkg/contexts/ctxlog"
"github.com/kolide/launcher/pkg/execwrapper"
"github.com/kolide/launcher/pkg/launcher"
Expand Down Expand Up @@ -193,6 +194,8 @@ func runSubcommands(systemMultiSlogger *multislogger.MultiSlogger) error {
run = runUninstall
case "secure-enclave":
run = runSecureEnclave
case "watchdog": // note: this is currently only implemented for windows
run = watchdog.RunWatchdogService
default:
return fmt.Errorf("unknown subcommand %s", os.Args[1])
}
Expand Down
2 changes: 2 additions & 0 deletions cmd/launcher/svc_config_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ func checkDependOnService(launcherServiceKey registry.Key, logger *slog.Logger)

// checkRestartActions checks the current value of our `SERVICE_FAILURE_ACTIONS_FLAG` and
// sets it to true if required. See https://learn.microsoft.com/en-us/windows/win32/api/winsvc/ns-winsvc-service_failure_actions_flag
// if we choose to implement restart backoff, that logic must be added here (it is not exposed via wix). See the "Windows Service Manager"
// doc in Notion for additional details on configurability
func checkRestartActions(logger *slog.Logger) {
sman, err := mgr.Connect()
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion cmd/launcher/svc_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ func runWindowsSvc(systemSlogger *multislogger.MultiSlogger, args []string) erro
}

// Confirm that service configuration is up-to-date
checkServiceConfiguration(systemSlogger.Logger, opts)
checkServiceConfiguration(localSlogger.Logger, opts)

systemSlogger.Log(context.TODO(), slog.LevelInfo,
"launching service",
Expand Down
33 changes: 33 additions & 0 deletions docs/architecture/launcher_watchdog.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
### Watchdog Service

Note that for the initial implementation, this service is windows only. It is intentionally designed to give room for alternate OS implementations if needed in the future.
Most of the relevant code can be found in [ee/watchdog](../../ee/watchdog/)

Here is a basic sequence diagram displaying the enable path for the windows watchdog service. The `launcher_watchdog_enabled` control flag will trigger the initial configuration and installation, and removal of the flag will trigger removal of the service.

```mermaid
sequenceDiagram
participant LauncherKolideK2Svc
Note right of LauncherKolideK2Svc: ./launcher.exe svc ...
create participant WindowsServiceManager
LauncherKolideK2Svc->>WindowsServiceManager: if launcher_watchdog_enabled
create participant LauncherKolideWatchdogSvc
WindowsServiceManager->>LauncherKolideWatchdogSvc: have we installed the watchdog?
Note left of LauncherKolideWatchdogSvc: ./launcher.exe watchdog

alt yes the service already exists
LauncherKolideK2Svc->>LauncherKolideWatchdogSvc: Restart to ensure latest
else no the service does not exist
LauncherKolideK2Svc->>WindowsServiceManager: 1 - create, configure, etc
LauncherKolideK2Svc->>LauncherKolideWatchdogSvc: 2 - Start
activate LauncherKolideWatchdogSvc
end

loop every n minutes
LauncherKolideWatchdogSvc->>WindowsServiceManager: Query LauncherKolideK2Svc status
LauncherKolideWatchdogSvc->>LauncherKolideK2Svc: Start if Stopped
end
```

The restart functionality is currently limited to detecting a stopped state, but the idea here is to lay out the foundation for more advanced healthchecking.
The watchdog service itself runs as a separate invocation of launcher, writing all logs to sqlite. The main invocation of launcher runs a watchdog controller, which responds to the `launcher_watchdog_enabled` flag, and publishes all sqlite logs to debug.json.
10 changes: 10 additions & 0 deletions ee/agent/flags/flag_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,16 @@ func (fc *FlagController) ExportTraces() bool {
).get(fc.getControlServerValue(keys.ExportTraces))
}

func (fc *FlagController) SetLauncherWatchdogEnabled(enabled bool) error {
return fc.setControlServerValue(keys.LauncherWatchdogEnabled, boolToBytes(enabled))
}

func (fc *FlagController) LauncherWatchdogEnabled() bool {
return NewBoolFlagValue(
WithDefaultBool(false),
).get(fc.getControlServerValue(keys.LauncherWatchdogEnabled))
}

func (fc *FlagController) SetTraceSamplingRate(rate float64) error {
return fc.setControlServerValue(keys.TraceSamplingRate, float64ToBytes(rate))
}
Expand Down
1 change: 1 addition & 0 deletions ee/agent/flags/keys/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ const (
DisableTraceIngestTLS FlagKey = "disable_trace_ingest_tls"
InModernStandby FlagKey = "in_modern_standby"
LocalDevelopmentPath FlagKey = "localdev_path"
LauncherWatchdogEnabled FlagKey = "launcher_watchdog_enabled" // note that this will only impact windows deployments for now
)

func (key FlagKey) String() string {
Expand Down
7 changes: 7 additions & 0 deletions ee/agent/knapsack/knapsack.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,13 @@ func (k *knapsack) TokenStore() types.KVStore {
return k.getKVStore(storage.TokenStore)
}

func (k *knapsack) SetLauncherWatchdogEnabled(enabled bool) error {
return k.flags.SetLauncherWatchdogEnabled(enabled)
}
func (k *knapsack) LauncherWatchdogEnabled() bool {
return k.flags.LauncherWatchdogEnabled()
}

func (k *knapsack) getKVStore(storeType storage.Store) types.KVStore {
if k == nil {
return nil
Expand Down
23 changes: 23 additions & 0 deletions ee/agent/storage/sqlite/keyvalue_store_sqlite.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ type storeName int

const (
StartupSettingsStore storeName = iota
WatchdogLogStore storeName = 1
)

var missingMigrationErrFormat = regexp.MustCompile(`no migration found for version \d+`)
Expand All @@ -33,6 +34,8 @@ func (s storeName) String() string {
switch s {
case StartupSettingsStore:
return "startup_settings"
case WatchdogLogStore:
return "watchdog_logs"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we want to push other logs here (say early startup logs) how would we do it? Would we make a new store? Accept it as slightly misnamed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think new store will give us more implementation flexibility and we can still re-use all of the new interfaces and wire that in pretty easily

}

return ""
Expand All @@ -48,6 +51,15 @@ type sqliteStore struct {
tableName string
}

type sqliteColumns struct {
pk string
valueColumn string
// isLogstore is used to determine whether the underlying table can support our LogStore interface methods.
// because any logstore iteration must scan the values into known types, we use this to avoid pulling in
// the reflect package here and making this more complicated than it needs to be
isLogstore bool
RebeccaMahany marked this conversation as resolved.
Show resolved Hide resolved
}

// OpenRO opens a connection to the database in the given root directory; it does
// not perform database creation or migration.
func OpenRO(ctx context.Context, rootDirectory string, name storeName) (*sqliteStore, error) {
Expand Down Expand Up @@ -323,6 +335,17 @@ ON CONFLICT (name) DO UPDATE SET value=excluded.value;`
return deletedKeys, nil
}

func (s *sqliteStore) getColumns() *sqliteColumns {
switch s.tableName {
case StartupSettingsStore.String():
return &sqliteColumns{pk: "name", valueColumn: "value", isLogstore: false}
case WatchdogLogStore.String():
return &sqliteColumns{pk: "timestamp", valueColumn: "log", isLogstore: true}
}

return nil
}

func isMissingMigrationError(err error) bool {
return missingMigrationErrFormat.MatchString(err.Error())
}
125 changes: 125 additions & 0 deletions ee/agent/storage/sqlite/logstore_sqlite.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package agentsqlite

import (
"errors"
"fmt"
"strings"
"time"
)

func (s *sqliteStore) AppendValue(timestamp int64, value []byte) error {
colInfo := s.getColumns()
if s == nil || s.conn == nil || colInfo == nil {
return errors.New("store is nil")
}

if s.readOnly {
return errors.New("cannot perform update with RO connection")
}

if !colInfo.isLogstore {
return errors.New("this table type does not support adding values by timestamp")
}

insertSql := fmt.Sprintf(
`INSERT INTO %s (%s, %s) VALUES (?, ?)`,
directionless marked this conversation as resolved.
Show resolved Hide resolved
s.tableName,
colInfo.pk,
colInfo.valueColumn,
)

if _, err := s.conn.Exec(insertSql, timestamp, value); err != nil {
return fmt.Errorf("appending row into %s: %w", s.tableName, err)
}

return nil
}

func (s *sqliteStore) DeleteRows(rowids ...any) error {
if s == nil || s.conn == nil {
return errors.New("store is nil")
}

if s.readOnly {
return errors.New("cannot perform deletes with RO connection")
}

if len(rowids) == 0 {
return nil
}

// interpolate the proper number of question marks
paramQs := strings.Repeat("?,", len(rowids))
paramQs = paramQs[:len(paramQs)-1]
deleteSql := fmt.Sprintf(`DELETE FROM %s WHERE rowid IN (%s)`, s.tableName, paramQs)

if _, err := s.conn.Exec(deleteSql, rowids...); err != nil {
return fmt.Errorf("deleting row from %s: %w", s.tableName, err)
}

return nil
}

func (s *sqliteStore) ForEach(fn func(rowid, timestamp int64, v []byte) error) error {
colInfo := s.getColumns()
if s == nil || s.conn == nil || colInfo == nil {
return errors.New("store is nil")
}
RebeccaMahany marked this conversation as resolved.
Show resolved Hide resolved

if !colInfo.isLogstore {
return errors.New("this table type is not supported for timestamped iteration")
}

query := fmt.Sprintf(
`SELECT rowid, %s, %s FROM %s;`,
colInfo.pk,
colInfo.valueColumn,
s.tableName,
)

rows, err := s.conn.Query(query)
if err != nil {
return fmt.Errorf("issuing foreach query: %w", err)
}

defer rows.Close()

for rows.Next() {
var rowid int64
var timestamp int64
var result string
if err := rows.Scan(&rowid, &timestamp, &result); err != nil {
return fmt.Errorf("scanning foreach query: %w", err)
}

if err := fn(rowid, timestamp, []byte(result)); err != nil {
return fmt.Errorf("caller error during foreach iteration: %w", err)
}
}

return nil
}

// Write implements the io.Writer interface, allowing sqliteStore to be used as
// a logging backend via multislogger handler
func (s *sqliteStore) Write(p []byte) (n int, err error) {
if s.readOnly {
return 0, errors.New("cannot perform write with RO connection")
}

colInfo := s.getColumns()
if s == nil || s.conn == nil || colInfo == nil {
return 0, errors.New("store is nil")
}

if !colInfo.isLogstore {
return 0, errors.New("this table type is not supported for timestamped logging")
}

timestamp := time.Now().Unix()
if err := s.AppendValue(timestamp, p); err != nil {
return 0, err
}

return len(p), nil
}
45 changes: 45 additions & 0 deletions ee/agent/storage/sqlite/logstore_sqlite_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package agentsqlite

import (
"context"
"encoding/json"
"fmt"
"testing"
"time"

"github.com/stretchr/testify/require"
)

func TestAppendAndIterateValues(t *testing.T) {
t.Parallel()

testRootDir := t.TempDir()

s, err := OpenRW(context.TODO(), testRootDir, WatchdogLogStore)
require.NoError(t, err, "creating test store")

startTime := time.Now()
expectedLogCount := 5
for i := 0; i < expectedLogCount; i++ {
currTime := startTime.Add(time.Duration(i) * time.Minute)
logEntry := fmt.Sprintf(`{"time":"%s", "msg":"testMessage%d"}`, currTime.Format(time.RFC3339), i)
require.NoError(t, s.AppendValue(currTime.Unix(), []byte(logEntry)), "expected no error appending value row")
}

logsSeen := 0
err = s.ForEach(func(rowid, timestamp int64, v []byte) error {
logRecord := make(map[string]any)

require.NoError(t, json.Unmarshal(v, &logRecord), "expected to be able to unmarshal row value")
expectedTime := startTime.Add(time.Duration(logsSeen) * time.Minute)
require.Equal(t, expectedTime.Unix(), timestamp, "expected log timestamp to match")

logsSeen++
return nil
})

require.NoError(t, err, "expected no error iterating over new logs")
require.Equal(t, expectedLogCount, logsSeen, "did not see expected count of logs during iteration")

require.NoError(t, s.Close())
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
DROP TABLE IF EXISTS watchdog_logs;
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
CREATE TABLE IF NOT EXISTS watchdog_logs (
timestamp INT NOT NULL,
log TEXT
);
4 changes: 4 additions & 0 deletions ee/agent/types/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,4 +222,8 @@ type Flags interface {

// LocalDevelopmentPath points to a local build of launcher to use instead of the one selected from the autoupdate library
LocalDevelopmentPath() string

// LauncherWatchdogEnabled controls whether launcher installs/runs, or stops/removes the launcher watchdog service
SetLauncherWatchdogEnabled(enabled bool) error
LauncherWatchdogEnabled() bool
}
6 changes: 5 additions & 1 deletion ee/agent/types/keyvalue_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,14 @@ type GetterSetter interface {
Setter
}

type Closer interface {
Close() error
}

// GetterCloser extends the Getter interface with a Close method.
type GetterCloser interface {
Getter
Close() error
Closer
}

// GetterUpdaterCloser groups the Get, Update, and Close methods.
Expand Down
Loading
Loading