Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detail LND's debug procedures. #9138

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 7 additions & 47 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,12 +352,7 @@ type Config struct {

DebugLevel string `short:"d" long:"debuglevel" description:"Logging level for all subsystems {trace, debug, info, warn, error, critical} -- You may also specify <global-level>,<subsystem>=<level>,<subsystem2>=<level>,... to set the log level for individual subsystems -- Use show to list available subsystems"`

CPUProfile string `long:"cpuprofile" description:"Write CPU profile to the specified file"`

Profile string `long:"profile" description:"Enable HTTP profiling on either a port or host:port"`

BlockingProfile int `long:"blockingprofile" description:"Used to enable a blocking profile to be served on the profiling port. This takes a value from 0 to 1, with 1 including every blocking event, and 0 including no events."`
MutexProfile int `long:"mutexprofile" description:"Used to Enable a mutex profile to be served on the profiling port. This takes a value from 0 to 1, with 1 including every mutex event, and 0 including no events."`
Pprof *lncfg.Pprof `group:"Pprof" namespace:"pprof"`

UnsafeDisconnect bool `long:"unsafe-disconnect" description:"DEPRECATED: Allows the rpcserver to intentionally disconnect from peers with open channels. THIS FLAG WILL BE REMOVED IN 0.10.0" hidden:"true"`
UnsafeReplay bool `long:"unsafe-replay" description:"Causes a link to replay the adds on its commitment txn after starting up, this enables testing of the sphinx replay logic."`
Expand Down Expand Up @@ -821,7 +816,8 @@ func LoadConfig(interceptor signal.Interceptor) (*Config, error) {
cleanCfg, err := ValidateConfig(
cfg, interceptor, fileParser, flagParser,
)
if usageErr, ok := err.(*usageError); ok {
var usageErr *lncfg.UsageError
if errors.As(err, &usageErr) {
// The logging system might not yet be initialized, so we also
// write to stderr to make sure the error appears somewhere.
_, _ = fmt.Fprintln(os.Stderr, usageMessage)
Expand All @@ -830,9 +826,9 @@ func LoadConfig(interceptor signal.Interceptor) (*Config, error) {
// The log subsystem might not yet be initialized. But we still
// try to log the error there since some packaging solutions
// might only look at the log and not stdout/stderr.
ltndLog.Warnf("Error validating config: %v", usageErr.err)
ltndLog.Warnf("Error validating config: %v", err)

return nil, usageErr.err
return nil, err
}
if err != nil {
// The log subsystem might not yet be initialized. But we still
Expand All @@ -856,18 +852,6 @@ func LoadConfig(interceptor signal.Interceptor) (*Config, error) {
return cleanCfg, nil
}

// usageError is an error type that signals a problem with the supplied flags.
type usageError struct {
err error
}

// Error returns the error string.
//
// NOTE: This is part of the error interface.
func (u *usageError) Error() string {
return u.err.Error()
}

// ValidateConfig check the given configuration to be sane. This makes sure no
// illegal values or combination of values are set. All file system paths are
// normalized. The cleaned up config is returned on success.
Expand Down Expand Up @@ -1347,31 +1331,6 @@ func ValidateConfig(cfg Config, interceptor signal.Interceptor, fileParser,
cfg.Autopilot.MaxChannelSize = int64(MaxFundingAmount)
}

// Validate profile port or host:port.
if cfg.Profile != "" {
str := "%s: The profile port must be between 1024 and 65535"

// Try to parse Profile as a host:port.
_, hostPort, err := net.SplitHostPort(cfg.Profile)
if err == nil {
// Determine if the port is valid.
profilePort, err := strconv.Atoi(hostPort)
if err != nil || profilePort < 1024 || profilePort > 65535 {
return nil, &usageError{mkErr(str)}
}
} else {
// Try to parse Profile as a port.
profilePort, err := strconv.Atoi(cfg.Profile)
if err != nil || profilePort < 1024 || profilePort > 65535 {
return nil, &usageError{mkErr(str)}
}

// Since the user just set a port, we will serve debugging
// information over localhost.
cfg.Profile = net.JoinHostPort("127.0.0.1", cfg.Profile)
}
}

// We'll now construct the network directory which will be where we
// store all the data specific to this chain/network.
cfg.networkDir = filepath.Join(
Expand Down Expand Up @@ -1469,7 +1428,7 @@ func ValidateConfig(cfg Config, interceptor signal.Interceptor, fileParser,
err = build.ParseAndSetDebugLevels(cfg.DebugLevel, cfg.LogWriter)
if err != nil {
str := "error parsing debug level: %v"
return nil, &usageError{mkErr(str, err)}
return nil, &lncfg.UsageError{Err: mkErr(str, err)}
}

// At least one RPCListener is required. So listen on localhost per
Expand Down Expand Up @@ -1714,6 +1673,7 @@ func ValidateConfig(cfg Config, interceptor signal.Interceptor, fileParser,
cfg.Htlcswitch,
cfg.Invoices,
cfg.Routing,
cfg.Pprof,
)
if err != nil {
return nil, err
Expand Down
178 changes: 165 additions & 13 deletions docs/debugging_lnd.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Table of Contents
1. [Overview](#overview)
1. [Debug Logging](#debug-logging)
1. [Capturing pprof data with `lnd`](#capturing-pprof-data-with-lnd)
1. [LND's built-in profiler](#built-in-profiler-in-lnd)

## Overview

Expand All @@ -12,36 +12,188 @@ data ahead of time.

## Debug Logging

You can enable debug logging in `lnd` by passing the `--debuglevel` flag. For
example, to increase the log level from `info` to `debug`:
LND supports different logging levels and you can also specify different logging
levels per subsystem. This makes it easy to focus on a particular subsystem
without clogging up the logs with a lot of noise. One can either set the logging
in the lnd.conf file or pass the flag `--debuglevel` with the specified level
when starting lnd.

LND supports the following logging levels (see [log.go](/build/log.go) and
[sample-lnd.conf](/sample-lnd.conf) for more info):

- `trace`
- `debug`
- `info`
- `warn`
- `error`
- `critical`
- `off`

LND is composed of many subsystems, those subsystems can be listed either by
setting the starting flag `--debuglevel` or by using the lncli program.

Show all subsystems:

```shell
$ lnd --debuglevel=debug
$ lnd --debuglevel=show
$ lncli debuglevel --show
```
For more details see [log.go](/log.go).

You may also specify logging per-subsystem, like this:

```shell
$ lnd --debuglevel=<subsystem>=<level>,<subsystem2>=<level>,...
$ lncli debuglevel --level=<subsystem>=<level>,<subsystem2>=<level>,...
```
The default global logging level is `info`. So if one wants to change the
global logging level and in addition also set a more detailed logging for a
particular subsystem the command would look like this (using `HSWC` (htlcswitch)
as an example subsystem):

```shell
$ lnd --debuglevel=critical,HSWC=debug
$ lncli debuglevel --level=critical,HSWC=debug
```
The subsystem names are case-sensitive and must be all uppercase.

To identify the subsystems defined by an abbreviated name, you can search for
the abbrieviation in the [log.go](/log.go) file. Each subsystem
declares a `btclog.Logger` instance locally which is then assigned via the
`UseLogger` function call in the `SetupLoggers` function.

Example HSWC:

For the `HSWC` subsystem a new sublogger is injected into the htlcswitch package
via the `UseLogger` function call in the `SetupLoggers` function. So the HSWC
subsystem handles the logging in the htlcswitch package.

```go
AddSubLogger(root, "HSWC", interceptor, htlcswitch.UseLogger)
```

Caution: Some logger subsystems are overwritten during the instanziation. An
example here is the `neutrino/query` package which instead of using the `BTCN`
prefix is overwritten by the `LNWL` subsystem.

Moreover when using the `lncli` command the return value will provide the
updated list of all subsystems and their associated logging levels. This makes
it easy to get an overview of the corrent logging level for the whole system.

Example:

```shell
$ lncli debuglevel --level=critical,HSWC=debug
{
"sub_systems": "ARPC=INF, ATPL=INF, BLPT=INF, BRAR=INF, BTCN=INF, BTWL=INF, CHAC=INF, CHBU=INF, CHCL=INF, CHDB=INF, CHFD=INF, CHFT=INF, CHNF=INF, CHRE=INF, CLUS=INF, CMGR=INF, CNCT=INF, CNFG=INF, CRTR=INF, DISC=INF, DRPC=INF, FNDG=INF, GRPH=INF, HLCK=INF, HSWC=DBG, INVC=INF, IRPC=INF, LNWL=INF, LTND=INF, NANN=INF, NRPC=INF, NTFN=INF, NTFR=INF, PEER=INF, PRNF=INF, PROM=INF, PRPC=INF, RPCP=INF, RPCS=INF, RPWL=INF, RRPC=INF, SGNR=INF, SPHX=INF, SRVR=INF, SWPR=INF, TORC=INF, UTXN=INF, VRPC=INF, WLKT=INF, WTCL=INF, WTWR=INF"
}
```

## Capturing pprof data with `lnd`

`lnd` has a built-in feature which allows you to capture profiling data at
## Built-in profiler in LND

`LND` has a built-in feature which allows you to capture profiling data at
runtime using [pprof](https://golang.org/pkg/runtime/pprof/), a profiler for
Go. The profiler has negligible performance overhead during normal operations
(unless you have explicitly enabled CPU profiling).
Go. It is recommended to enable the profiling server so that an analyis can be
triggered during runtime. There is only little overhead in enabling this
feature, because profiling is only started when calling the server endpoints.
However LND also allows to specify a cpu profile file via the `cpuprofile` flag
which triggers a cpu profile when LND starts and stops it when LND shuts down.
This is only recommended for debugging purposes, because the overhead is much
higher. To enable the profile server, start `lnd` with the `--profile` option
using a free port. As soon as the server is up different profiles can be fetched
from the `debug/pprof` endpoint using either the web interface or for example
`curl`.

To enable this ability, start `lnd` with the `--profile` option using a free port.
Example port `9736` is used for the profil server in the following examples.

```shell
$ lnd --profile=9736
```

Now, with `lnd` running, you can use the pprof endpoint on port 9736 to collect
runtime profiling data. You can fetch this data using `curl` like so:
### Different types of profiles

#### CPU profile

A cpu profile can be used to analyze the CPU usage of the program. When
obtaining it via the profile http endpoint you can specify the time duration as
a query parameter.

```shell
$ curl http://localhost:9736/debug/pprof/profile?seconds=10 > cpu.prof
```
#### Goroutine profile

The goroutine profile is very useful when analyzing deadlocks and lock
contention. It can be obtained via the web inteface or the following endpoint:

```shell
$ curl http://localhost:9736/debug/pprof/goroutine?debug=1
...
$ curl http://localhost:9736/debug/pprof/goroutine?debug=2 > goroutine.prof
```
The query parameter `debug=2` is optional but recommended and referes to the
format of the output file. Only this forrmat has the necessary information to
identify goroutines deadlocks. Otherwise `go tool pprof` needs to be used to
visualize the data and interpret the results.

#### Heap profile

The heap profile is useful to analyze memory allocations. It can be obtained via
the following endpoint:

```shell
$ curl http://localhost:9736/debug/pprof/heap > heap.prof
```
The documentation of the pprof packgae states that also a gc can be triggered
before obtaining the heap profile. This is done by setting the gc query
parameter. `gc=1`

#### Other profiles

There are several other options available like a mutex profile or a block
profile which gives insights into contention and bottlenecks of your program.
The web interface lists all the available profiles/endpoints which can be
obtained.

However mutex and block profiling need to be enabled separately by setting the
sampling rate via the config values `BlockingProfile` and `MutexProfile`. They
are off by default (0). These values represent sampling rates meaning that a value of
`1` will record every event leading to a significant overhead whereas a sample
rate of `n` will only record 1 out of nth events decreasing the aggressiveness
of the profiler.

Fetching the block and mutex profile:

```shell

$ curl http://localhost:9736/debug/pprof/mutex?debug=2

$ curl http://localhost:9736/debug/pprof/block?debug=2


```

You can also fetch the full programm command how you started LND which can
provide details which flags were used when starting LND.

```shell
$ curl http://localhost:9736/debug/pprof/cmdline > cmdline.prof
```

There are also other endpoints available see the [pprof documentation](https://golang.org/pkg/runtime/pprof/) for more details.



#### Visualizing the profile dumps

It can be hard to make sense of the profile dumps by just looking at them
therefore the Golang ecosystem provides tools to analyze those profile dumps
either via the terminal or by visualizing them. One of the tools is `go tool pprof`.

Assuming the profile was fetched via `curl` as in the examples above a nice
svg visualization can be generated for the cpu profile like this:

```shell
$ go tool pprof -svg cpu.prof > cpu.svg
```
Details how to interpret these visualizations can be found in the [pprof documentation](https://github.com/google/pprof/blob/main/doc/README.md#interpreting-the-callgraph).

25 changes: 25 additions & 0 deletions lncfg/error.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package lncfg

import "fmt"

// UsageError is an error type that signals a problem with the supplied flags.
type UsageError struct {
Err error
}

// Error returns the error string.
//
// NOTE: This is part of the error interface.
func (u *UsageError) Error() string {
return u.Err.Error()
}

// Unwrap returns the underlying error.
func (u *UsageError) Unwrap() error {
return u.Err
}

// mkErr creates a new error from a string.
func mkErr(format string, args ...interface{}) error {
return fmt.Errorf(format, args...)
}
Loading
Loading