lightningnetwork · ziggie1984 · Sep 24, 2024 · Sep 25, 2024 · Sep 25, 2024
diff --git a/config.go b/config.go
@@ -352,12 +352,7 @@ type Config struct {
 
 	DebugLevel string `short:"d" long:"debuglevel" description:"Logging level for all subsystems {trace, debug, info, warn, error, critical} -- You may also specify <global-level>,<subsystem>=<level>,<subsystem2>=<level>,... to set the log level for individual subsystems -- Use show to list available subsystems"`
 
-	CPUProfile string `long:"cpuprofile" description:"Write CPU profile to the specified file"`
-
-	Profile string `long:"profile" description:"Enable HTTP profiling on either a port or host:port"`
-
-	BlockingProfile int `long:"blockingprofile" description:"Used to enable a blocking profile to be served on the profiling port. This takes a value from 0 to 1, with 1 including every blocking event, and 0 including no events."`
-	MutexProfile    int `long:"mutexprofile" description:"Used to Enable a mutex profile to be served on the profiling port. This takes a value from 0 to 1, with 1 including every mutex event, and 0 including no events."`
+	Pprof *lncfg.Pprof `group:"Pprof" namespace:"pprof"`
 
 	UnsafeDisconnect   bool   `long:"unsafe-disconnect" description:"DEPRECATED: Allows the rpcserver to intentionally disconnect from peers with open channels. THIS FLAG WILL BE REMOVED IN 0.10.0" hidden:"true"`
 	UnsafeReplay       bool   `long:"unsafe-replay" description:"Causes a link to replay the adds on its commitment txn after starting up, this enables testing of the sphinx replay logic."`
@@ -821,7 +816,8 @@ func LoadConfig(interceptor signal.Interceptor) (*Config, error) {
 	cleanCfg, err := ValidateConfig(
 		cfg, interceptor, fileParser, flagParser,
 	)
-	if usageErr, ok := err.(*usageError); ok {
+	var usageErr *lncfg.UsageError
+	if errors.As(err, &usageErr) {
 		// The logging system might not yet be initialized, so we also
 		// write to stderr to make sure the error appears somewhere.
 		_, _ = fmt.Fprintln(os.Stderr, usageMessage)
@@ -830,9 +826,9 @@ func LoadConfig(interceptor signal.Interceptor) (*Config, error) {
 		// The log subsystem might not yet be initialized. But we still
 		// try to log the error there since some packaging solutions
 		// might only look at the log and not stdout/stderr.
-		ltndLog.Warnf("Error validating config: %v", usageErr.err)
+		ltndLog.Warnf("Error validating config: %v", err)
 
-		return nil, usageErr.err
+		return nil, err
 	}
 	if err != nil {
 		// The log subsystem might not yet be initialized. But we still
@@ -856,18 +852,6 @@ func LoadConfig(interceptor signal.Interceptor) (*Config, error) {
 	return cleanCfg, nil
 }
 
-// usageError is an error type that signals a problem with the supplied flags.
-type usageError struct {
-	err error
-}
-
-// Error returns the error string.
-//
-// NOTE: This is part of the error interface.
-func (u *usageError) Error() string {
-	return u.err.Error()
-}
-
 // ValidateConfig check the given configuration to be sane. This makes sure no
 // illegal values or combination of values are set. All file system paths are
 // normalized. The cleaned up config is returned on success.
@@ -1347,31 +1331,6 @@ func ValidateConfig(cfg Config, interceptor signal.Interceptor, fileParser,
 		cfg.Autopilot.MaxChannelSize = int64(MaxFundingAmount)
 	}
 
-	// Validate profile port or host:port.
-	if cfg.Profile != "" {
-		str := "%s: The profile port must be between 1024 and 65535"
-
-		// Try to parse Profile as a host:port.
-		_, hostPort, err := net.SplitHostPort(cfg.Profile)
-		if err == nil {
-			// Determine if the port is valid.
-			profilePort, err := strconv.Atoi(hostPort)
-			if err != nil || profilePort < 1024 || profilePort > 65535 {
-				return nil, &usageError{mkErr(str)}
-			}
-		} else {
-			// Try to parse Profile as a port.
-			profilePort, err := strconv.Atoi(cfg.Profile)
-			if err != nil || profilePort < 1024 || profilePort > 65535 {
-				return nil, &usageError{mkErr(str)}
-			}
-
-			// Since the user just set a port, we will serve debugging
-			// information over localhost.
-			cfg.Profile = net.JoinHostPort("127.0.0.1", cfg.Profile)
-		}
-	}
-
 	// We'll now construct the network directory which will be where we
 	// store all the data specific to this chain/network.
 	cfg.networkDir = filepath.Join(
@@ -1469,7 +1428,7 @@ func ValidateConfig(cfg Config, interceptor signal.Interceptor, fileParser,
 	err = build.ParseAndSetDebugLevels(cfg.DebugLevel, cfg.LogWriter)
 	if err != nil {
 		str := "error parsing debug level: %v"
-		return nil, &usageError{mkErr(str, err)}
+		return nil, &lncfg.UsageError{Err: mkErr(str, err)}
 	}
 
 	// At least one RPCListener is required. So listen on localhost per
@@ -1714,6 +1673,7 @@ func ValidateConfig(cfg Config, interceptor signal.Interceptor, fileParser,
 		cfg.Htlcswitch,
 		cfg.Invoices,
 		cfg.Routing,
+		cfg.Pprof,
 	)
 	if err != nil {
 		return nil, err

diff --git a/docs/debugging_lnd.md b/docs/debugging_lnd.md
@@ -1,7 +1,7 @@
 # Table of Contents
 1. [Overview](#overview)
 1. [Debug Logging](#debug-logging)
-1. [Capturing pprof data with `lnd`](#capturing-pprof-data-with-lnd)
+1. [LND's built-in profiler](#built-in-profiler-in-lnd)
 
 ## Overview
 
@@ -12,36 +12,188 @@ data ahead of time.
 
 ## Debug Logging
 
-You can enable debug logging in `lnd` by passing the `--debuglevel` flag. For
-example, to increase the log level from `info` to `debug`:
+LND supports different logging levels and you can also specify different logging
+levels per subsystem. This makes it easy to focus on a particular subsystem
+without clogging up the logs with a lot of noise. One can either set the logging
+in the lnd.conf file or pass the flag `--debuglevel` with the specified level 
+when starting lnd.
+
+LND supports the following logging levels (see [log.go](/build/log.go) and
+[sample-lnd.conf](/sample-lnd.conf) for more info):
+
+- `trace`
+- `debug`
+- `info`
+- `warn`
+- `error`
+- `critical`
+- `off`
+
+LND is composed of many subsystems, those subsystems can be listed either by
+setting the starting flag `--debuglevel` or by using the lncli program.
+
+Show all subsystems:
 
 ```shell
-$  lnd --debuglevel=debug
+$  lnd --debuglevel=show
+$  lncli debuglevel --show
 ```
+For more details see [log.go](/log.go).
 
 You may also specify logging per-subsystem, like this:
 
 ```shell
 $  lnd --debuglevel=<subsystem>=<level>,<subsystem2>=<level>,...
+$  lncli debuglevel --level=<subsystem>=<level>,<subsystem2>=<level>,...
+```
+The default global logging level is `info`. So if one wants to change the
+global logging level and in addition also set a more detailed logging for a 
+particular subsystem the command would look like this (using `HSWC` (htlcswitch)
+as an example subsystem):
+
+```shell
+$ lnd --debuglevel=critical,HSWC=debug
+$ lncli debuglevel --level=critical,HSWC=debug
+```
+The subsystem names are case-sensitive and must be all uppercase.
+
+To identify the subsystems defined by an abbreviated name, you can search for 
+the abbrieviation in the [log.go](/log.go) file. Each subsystem 
+declares a `btclog.Logger` instance locally which is then assigned via the
+`UseLogger` function call in the `SetupLoggers` function.
+
+Example HSWC:
+
+For the `HSWC` subsystem a new sublogger is injected into the htlcswitch package
+via the `UseLogger` function call in the `SetupLoggers` function. So the HSWC
+subsystem handles the logging in the htlcswitch package.
+
+```go
+ AddSubLogger(root, "HSWC", interceptor, htlcswitch.UseLogger)
+```
+
+Caution: Some logger subsystems are overwritten during the instanziation. An
+example here is the `neutrino/query` package which instead of using the `BTCN`
+prefix is overwritten by the `LNWL` subsystem.
+
+Moreover when using the `lncli` command the return value will provide the 
+updated list of all subsystems and their associated logging levels. This makes
+it easy to get an overview of the corrent logging level for the whole system.
+
+Example:
+
+```shell
+$ lncli debuglevel --level=critical,HSWC=debug
+{
+    "sub_systems": "ARPC=INF, ATPL=INF, BLPT=INF, BRAR=INF, BTCN=INF, BTWL=INF, CHAC=INF, CHBU=INF, CHCL=INF, CHDB=INF, CHFD=INF, CHFT=INF, CHNF=INF, CHRE=INF, CLUS=INF, CMGR=INF, CNCT=INF, CNFG=INF, CRTR=INF, DISC=INF, DRPC=INF, FNDG=INF, GRPH=INF, HLCK=INF, HSWC=DBG, INVC=INF, IRPC=INF, LNWL=INF, LTND=INF, NANN=INF, NRPC=INF, NTFN=INF, NTFR=INF, PEER=INF, PRNF=INF, PROM=INF, PRPC=INF, RPCP=INF, RPCS=INF, RPWL=INF, RRPC=INF, SGNR=INF, SPHX=INF, SRVR=INF, SWPR=INF, TORC=INF, UTXN=INF, VRPC=INF, WLKT=INF, WTCL=INF, WTWR=INF"
+}
 ```
 
-## Capturing pprof data with `lnd`
 
-`lnd` has a built-in feature which allows you to capture profiling data at
+## Built-in profiler in LND
+
+`LND` has a built-in feature which allows you to capture profiling data at
 runtime using [pprof](https://golang.org/pkg/runtime/pprof/), a profiler for
-Go. The profiler has negligible performance overhead during normal operations
-(unless you have explicitly enabled CPU profiling).
+Go. It is recommended to enable the profiling server so that an analyis can be
+triggered during runtime. There is only little overhead in enabling this 
+feature, because profiling is only started when calling the server endpoints.
+However LND also allows to specify a cpu profile file via the `cpuprofile` flag 
+which triggers a cpu profile when LND starts and stops it when LND shuts down. 
+This is only recommended for debugging purposes, because the overhead is much 
+higher. To enable the profile server, start `lnd` with the `--profile` option 
+using a free port. As soon as the server is up different profiles can be fetched
+from the `debug/pprof` endpoint using either the web interface or for example 
+`curl`.
 
-To enable this ability, start `lnd` with the `--profile` option using a free port.
+Example port `9736` is used for the profil server in the following examples.
 
 ```shell
 $  lnd --profile=9736
 ```
 
-Now, with `lnd` running, you can use the pprof endpoint on port 9736 to collect
-runtime profiling data. You can fetch this data using `curl` like so:
+### Different types of profiles
+
+#### CPU profile
+
+A cpu profile can be used to analyze the CPU usage of the program. When
+obtaining it via the profile http endpoint you can specify the time duration as
+a query parameter.
+
+```shell
+$ curl http://localhost:9736/debug/pprof/profile?seconds=10 > cpu.prof
+```
+#### Goroutine profile
+
+The goroutine profile is very useful when analyzing deadlocks and lock 
+contention. It can be obtained via the web inteface or the following endpoint:
 
 ```shell
-$  curl http://localhost:9736/debug/pprof/goroutine?debug=1
-...
+$ curl http://localhost:9736/debug/pprof/goroutine?debug=2 > goroutine.prof
 ```
+The query parameter `debug=2` is optional but recommended and referes to the 
+format of the output file. Only this forrmat has the necessary information to 
+identify goroutines deadlocks. Otherwise `go tool pprof` needs to be used to
+visualize the data and interpret the results.
+
+#### Heap profile
+
+The heap profile is useful to analyze memory allocations. It can be obtained via
+the following endpoint:
+
+```shell
+$ curl http://localhost:9736/debug/pprof/heap > heap.prof
+```
+The documentation of the pprof packgae states that also a gc can be triggered
+before obtaining the heap profile. This is done by setting the gc query
+parameter. `gc=1`
+
+#### Other profiles
+
+There are several other options available like a mutex profile or a block 
+profile which gives insights into contention and bottlenecks of your program.
+The web interface lists all the available profiles/endpoints which can be
+obtained.
+
+However mutex and block profiling need to be enabled separately by setting the
+sampling rate via the config values `BlockingProfile` and `MutexProfile`. They
+are off by default (0). These values represent sampling rates meaning that a value of
+`1` will record every event leading to a significant overhead whereas a sample
+rate of `n` will only record 1 out of nth events decreasing the aggressiveness
+of the profiler.
+
+Fetching the block and mutex profile:
+
+```shell
+
+$ curl http://localhost:9736/debug/pprof/mutex?debug=2
+
+$ curl http://localhost:9736/debug/pprof/block?debug=2
+
+
+```
+
+You can also fetch the full programm command how you started LND which can
+provide details which flags were used when starting LND.
+
+```shell
+$ curl http://localhost:9736/debug/pprof/cmdline > cmdline.prof
+```
+
+There are also other endpoints available see the [pprof documentation](https://golang.org/pkg/runtime/pprof/) for more details.
+
+
+
+#### Visualizing the profile dumps
+
+It can be hard to make sense of the profile dumps by just looking at them 
+therefore the Golang ecosystem provides tools to analyze those profile dumps 
+either via the terminal or by visualizing them. One of the tools is `go tool pprof`.
+
+Assuming the profile was fetched via `curl` as in the examples above a nice
+svg visualization can be generated for the cpu profile like this:
+
+```shell
+$ go tool pprof -svg cpu.prof > cpu.svg
+```
+Details how to interpret these visualizations can be found in the [pprof documentation](https://github.com/google/pprof/blob/main/doc/README.md#interpreting-the-callgraph).
+
diff --git a/lncfg/error.go b/lncfg/error.go
@@ -0,0 +1,25 @@
+package lncfg
+
+import "fmt"
+
+// UsageError is an error type that signals a problem with the supplied flags.
+type UsageError struct {
+	Err error
+}
+
+// Error returns the error string.
+//
+// NOTE: This is part of the error interface.
+func (u *UsageError) Error() string {
+	return u.Err.Error()
+}
+
+// Unwrap returns the underlying error.
+func (u *UsageError) Unwrap() error {
+	return u.Err
+}
+
+// mkErr creates a new error from a string.
+func mkErr(format string, args ...interface{}) error {
+	return fmt.Errorf(format, args...)
+}