Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Enable open telemetry tracing #1189

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ IMAGE:=descheduler:$(VERSION)
# IMAGE_GCLOUD is the image name of descheduler in the remote registry
IMAGE_GCLOUD:=$(REGISTRY)/descheduler:$(VERSION)

# CURRENT_DIR is the current dir where the Makefile exists
CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))


# TODO: upload binaries to GCS bucket
#
# In the future binaries can be uploaded to
Expand Down Expand Up @@ -129,6 +133,9 @@ gen:
./hack/update-generated-defaulters.sh
./hack/update-docs.sh

gen-docker:
$(CONTAINER_ENGINE) run --entrypoint make -it -v $(CURRENT_DIR):/go/src/sigs.k8s.io/descheduler -w /go/src/sigs.k8s.io/descheduler golang:1.20.3 gen

verify-gen:
./hack/verify-conversions.sh
./hack/verify-deep-copies.sh
Expand Down
7 changes: 7 additions & 0 deletions charts/descheduler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,13 @@ deschedulerPolicy:
# maxNoOfPodsToEvictPerNamespace: 10
# ignorePvcPods: true
# evictLocalStoragePods: true
# tracing:
# collectorEndpoint: otel-collector.observability.svc.cluster.local:4317
# transportCert: ""
# serviceName: ""
# serviceNamespace: ""
# sampleRate: 1.0
# fallbackToNoOpProviderOnError: true
strategies:
RemoveDuplicates:
enabled: true
Expand Down
11 changes: 10 additions & 1 deletion cmd/descheduler/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"sigs.k8s.io/descheduler/pkg/apis/componentconfig"
"sigs.k8s.io/descheduler/pkg/apis/componentconfig/v1alpha1"
deschedulerscheme "sigs.k8s.io/descheduler/pkg/descheduler/scheme"
"sigs.k8s.io/descheduler/pkg/tracing"
)

const (
Expand Down Expand Up @@ -74,7 +75,9 @@ func newDefaultComponentConfig() (*componentconfig.DeschedulerConfiguration, err
},
}
deschedulerscheme.Scheme.Default(&versionedCfg)
cfg := componentconfig.DeschedulerConfiguration{}
cfg := componentconfig.DeschedulerConfiguration{
Tracing: componentconfig.TracingConfiguration{},
}
if err := deschedulerscheme.Scheme.Convert(&versionedCfg, &cfg, nil); err != nil {
return nil, err
}
Expand All @@ -92,6 +95,12 @@ func (rs *DeschedulerServer) AddFlags(fs *pflag.FlagSet) {
fs.StringVar(&rs.PolicyConfigFile, "policy-config-file", rs.PolicyConfigFile, "File with descheduler policy configuration.")
fs.BoolVar(&rs.DryRun, "dry-run", rs.DryRun, "Execute descheduler in dry run mode.")
fs.BoolVar(&rs.DisableMetrics, "disable-metrics", rs.DisableMetrics, "Disables metrics. The metrics are by default served through https://localhost:10258/metrics. Secure address, resp. port can be changed through --bind-address, resp. --secure-port flags.")
fs.StringVar(&rs.Tracing.CollectorEndpoint, "otel-collector-endpoint", "", "Set this flag to the OpenTelemetry Collector Service Address")
fs.StringVar(&rs.Tracing.TransportCert, "otel-transport-ca-cert", "", "Path of the CA Cert that can be used to generate the client Certificate for establishing secure connection to the OTEL in gRPC mode")
fs.StringVar(&rs.Tracing.ServiceName, "otel-service-name", tracing.DefaultServiceName, "OTEL Trace name to be used with the resources")
fs.StringVar(&rs.Tracing.ServiceNamespace, "otel-trace-namespace", "", "OTEL Trace namespace to be used with the resources")
fs.Float64Var(&rs.Tracing.SampleRate, "otel-sample-rate", 1.0, "Sample rate to collect the Traces")
fs.BoolVar(&rs.Tracing.FallbackToNoOpProviderOnError, "otel-fallback-no-op-on-error", false, "Fallback to NoOp Tracer in case of error")

componentbaseoptions.BindLeaderElectionFlags(&rs.LeaderElection, fs)

Expand Down
6 changes: 6 additions & 0 deletions cmd/descheduler/app/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (

"sigs.k8s.io/descheduler/cmd/descheduler/app/options"
"sigs.k8s.io/descheduler/pkg/descheduler"
"sigs.k8s.io/descheduler/pkg/tracing"

"github.com/spf13/cobra"

Expand Down Expand Up @@ -112,6 +113,11 @@ func NewDeschedulerCommand(out io.Writer) *cobra.Command {
}

func Run(ctx context.Context, rs *options.DeschedulerServer) error {
err := tracing.NewTracerProvider(ctx, rs.Tracing.CollectorEndpoint, rs.Tracing.TransportCert, rs.Tracing.ServiceName, rs.Tracing.ServiceNamespace, rs.Tracing.SampleRate, rs.Tracing.FallbackToNoOpProviderOnError)
if err != nil {
return err
}
defer tracing.Shutdown(ctx)
// increase the fake watch channel so the dry-run mode can be run
// over a cluster with thousands of pods
watch.DefaultChanSize = 100000
Expand Down
6 changes: 6 additions & 0 deletions docs/cli/descheduler.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ descheduler [flags]
--leader-elect-resource-namespace string The namespace of resource object that is used for locking during leader election. (default "kube-system")
--leader-elect-retry-period duration The duration the clients should wait between attempting acquisition and renewal of a leadership. This is only applicable if leader election is enabled. (default 26s)
--logging-format string Sets the log format. Permitted formats: "text", "json". Non-default formats don't honor these flags: --add-dir-header, --alsologtostderr, --log-backtrace-at, --log_dir, --log_file, --log_file_max_size, --logtostderr, --skip-headers, --skip-log-headers, --stderrthreshold, --log-flush-frequency.\nNon-default choices are currently alpha and subject to change without warning. (default "text")
--otel-collector-endpoint string Set this flag to the OpenTelemetry Collector Service Address
--otel-fallback-no-op-on-error Fallback to NoOp Tracer in case of error
--otel-sample-rate float Sample rate to collect the Traces (default 1)
--otel-service-name string OTEL Trace name to be used with the resources (default "descheduler")
--otel-trace-namespace string OTEL Trace namespace to be used with the resources
--otel-transport-ca-cert string Path of the CA Cert that can be used to generate the client Certificate for establishing secure connection to the OTEL in gRPC mode
--permit-address-sharing If true, SO_REUSEADDR will be used when binding the port. This allows binding to wildcard IPs like 0.0.0.0 and specific IPs in parallel, and it avoids waiting for the kernel to release sockets in TIME_WAIT state. [default=false]
--permit-port-sharing If true, SO_REUSEPORT will be used when binding the port, which allows more than one instance to bind on the same address and port. [default=false]
--policy-config-file string File with descheduler policy configuration.
Expand Down
12 changes: 6 additions & 6 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ require (
github.com/google/go-cmp v0.5.9
github.com/spf13/cobra v1.6.0
github.com/spf13/pflag v1.0.5
go.opentelemetry.io/otel v1.10.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.10.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.10.0
go.opentelemetry.io/otel/sdk v1.10.0
go.opentelemetry.io/otel/trace v1.10.0
google.golang.org/grpc v1.51.0
k8s.io/api v0.27.0
k8s.io/apimachinery v0.27.0
k8s.io/apiserver v0.27.0
Expand Down Expand Up @@ -75,13 +81,8 @@ require (
go.etcd.io/etcd/client/v3 v3.5.7 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.35.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.35.1 // indirect
go.opentelemetry.io/otel v1.10.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.10.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.10.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.10.0 // indirect
go.opentelemetry.io/otel/metric v0.31.0 // indirect
go.opentelemetry.io/otel/sdk v1.10.0 // indirect
go.opentelemetry.io/otel/trace v1.10.0 // indirect
go.opentelemetry.io/proto/otlp v0.19.0 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
Expand All @@ -98,7 +99,6 @@ require (
golang.org/x/tools v0.7.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/genproto v0.0.0-20220502173005-c8bf987b8c21 // indirect
google.golang.org/grpc v1.51.0 // indirect
google.golang.org/protobuf v1.28.1 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect
Expand Down
25 changes: 25 additions & 0 deletions pkg/apis/componentconfig/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ type DeschedulerConfiguration struct {
// IgnorePVCPods sets whether PVC pods should be allowed to be evicted
IgnorePVCPods bool

// Tracing specifies the options for tracing.
Tracing TracingConfiguration

// LeaderElection starts Deployment using leader election loop
LeaderElection componentbaseconfig.LeaderElectionConfiguration

Expand All @@ -66,3 +69,25 @@ type DeschedulerConfiguration struct {
// Refer to [ClientConnection](https://pkg.go.dev/k8s.io/kubernetes/pkg/apis/componentconfig#ClientConnectionConfiguration) for more information.
ClientConnection componentbaseconfig.ClientConnectionConfiguration
}

type TracingConfiguration struct {
// CollectorEndpoint is the address of the OpenTelemetry collector.
// If not specified, tracing will be used NoopTraceProvider.
CollectorEndpoint string
// TransportCert is the path to the certificate file for the OpenTelemetry collector.
// If not specified, provider will start in insecure mode.
TransportCert string
// ServiceName is the name of the service to be used in the OpenTelemetry collector.
// If not specified, the default value is "descheduler".
ServiceName string
// ServiceNamespace is the namespace of the service to be used in the OpenTelemetry collector.
// If not specified, tracing will be used default namespace.
ServiceNamespace string
// SampleRate is used to configure the sample rate of the OTEL trace collection. This value will
// be used as the Base value with sample ratio. A value >= 1.0 will sample everything and < 0 will
// not sample anything. Everything else is a percentage value.
SampleRate float64
// FallbackToNoOpProviderOnError can be set in case if you want your trace provider to fallback to
// no op provider in case if the configured end point based provider can't be setup.
FallbackToNoOpProviderOnError bool
}
25 changes: 25 additions & 0 deletions pkg/apis/componentconfig/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ type DeschedulerConfiguration struct {
// IgnorePVCPods sets whether PVC pods should be allowed to be evicted
IgnorePVCPods bool `json:"ignorePvcPods,omitempty"`

// Tracing is used to setup the required OTEL tracing configuration
Tracing TracingConfiguration `json:"tracing,omitempty"`

// LeaderElection starts Deployment using leader election loop
LeaderElection componentbaseconfig.LeaderElectionConfiguration `json:"leaderElection,omitempty"`

Expand All @@ -66,3 +69,25 @@ type DeschedulerConfiguration struct {
// Refer to [ClientConnection](https://pkg.go.dev/k8s.io/kubernetes/pkg/apis/componentconfig#ClientConnectionConfiguration) for more information.
ClientConnection componentbaseconfig.ClientConnectionConfiguration `json:"clientConnection,omitempty"`
}

type TracingConfiguration struct {
// CollectorEndpoint is the address of the OpenTelemetry collector.
// If not specified, tracing will be used NoopTraceProvider.
CollectorEndpoint string `json:"collectorEndpoint"`
// TransportCert is the path to the certificate file for the OpenTelemetry collector.
// If not specified, provider will start in insecure mode.
TransportCert string `json:"transportCert,omitempty"`
// ServiceName is the name of the service to be used in the OpenTelemetry collector.
// If not specified, the default value is "descheduler".
ServiceName string `json:"serviceName,omitempty"`
// ServiceNamespace is the namespace of the service to be used in the OpenTelemetry collector.
// If not specified, tracing will be used default namespace.
ServiceNamespace string `json:"serviceNamespace,omitempty"`
// SampleRate is used to configure the sample rate of the OTEL trace collection. This value will
// be used as the Base value with sample ratio. A value >= 1.0 will sample everything and < 0 will
// not sample anything. Everything else is a percentage value.
SampleRate float64 `json:"sampleRate"`
// FallbackToNoOpProviderOnError can be set in case if you want your trace provider to fallback to
// no op provider in case if the configured end point based provider can't be setup.
FallbackToNoOpProviderOnError bool `json:"fallbackToNoOpProviderOnError"`
}
46 changes: 46 additions & 0 deletions pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions pkg/apis/componentconfig/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading