From c7f50753b046c49b5ea9bbbb89f30cadba28b033 Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Tue, 25 Jul 2023 18:02:24 +0900 Subject: [PATCH] Consolidate the katib-cert-generator to the katib-controller Signed-off-by: Yuki Iwai --- .github/workflows/publish-core-images.yaml | 2 - README.md | 1 - cmd/cert-generator/v1beta1/Dockerfile | 25 -- cmd/cert-generator/v1beta1/main.go | 42 -- cmd/katib-controller/v1beta1/main.go | 59 ++- docs/developer-guide.md | 14 +- docs/images-location.md | 11 - examples/v1beta1/argo/README.md | 1 - examples/v1beta1/kind-cluster/README.md | 1 - examples/v1beta1/tekton/README.md | 1 - go.mod | 2 - go.sum | 5 - .../cert-generator/cert-generator.yaml | 27 -- .../cert-generator/kustomization.yaml | 7 - .../components/cert-generator/rbac.yaml | 48 --- .../components/controller/controller.yaml | 14 +- .../v1beta1/components/controller/rbac.yaml | 16 + .../katib-external-db/kustomization.yaml | 5 - .../kustomization.yaml | 5 - .../katib-standalone/kustomization.yaml | 5 - pkg/apis/config/v1beta1/types.go | 11 +- pkg/apis/manager/health/health.pb.go | 2 + pkg/apis/manager/v1beta1/api.pb.go | 12 +- pkg/cert-generator/v1beta1/cert-generator.go | 35 -- .../v1beta1/{generate => }/certificate.go | 2 +- .../v1beta1/{consts => }/const.go | 11 +- pkg/cert-generator/v1beta1/generate.go | 265 ++++++++++++ .../v1beta1/generate/generate.go | 212 ---------- .../v1beta1/generate/generate_test.go | 197 --------- pkg/cert-generator/v1beta1/generate_test.go | 248 +++++++++++ pkg/controller.v1beta1/consts/const.go | 7 + pkg/webhook/v1beta1/webhook.go | 3 +- scripts/v1beta1/build.sh | 3 - scripts/v1beta1/push.sh | 3 - test/e2e/v1beta1/hack/aws/argo_workflow.py | 390 ------------------ test/e2e/v1beta1/scripts/aws/setup-katib.sh | 1 - .../v1beta1/scripts/gh-actions/build-load.sh | 1 - .../v1beta1/scripts/gh-actions/setup-katib.sh | 3 - 38 files changed, 619 insertions(+), 1078 deletions(-) delete mode 100644 cmd/cert-generator/v1beta1/Dockerfile delete mode 100644 cmd/cert-generator/v1beta1/main.go delete mode 100644 manifests/v1beta1/components/cert-generator/cert-generator.yaml delete mode 100644 manifests/v1beta1/components/cert-generator/kustomization.yaml delete mode 100644 manifests/v1beta1/components/cert-generator/rbac.yaml delete mode 100644 pkg/cert-generator/v1beta1/cert-generator.go rename pkg/cert-generator/v1beta1/{generate => }/certificate.go (98%) rename pkg/cert-generator/v1beta1/{consts => }/const.go (75%) create mode 100644 pkg/cert-generator/v1beta1/generate.go delete mode 100644 pkg/cert-generator/v1beta1/generate/generate.go delete mode 100644 pkg/cert-generator/v1beta1/generate/generate_test.go create mode 100644 pkg/cert-generator/v1beta1/generate_test.go delete mode 100644 test/e2e/v1beta1/hack/aws/argo_workflow.py diff --git a/.github/workflows/publish-core-images.yaml b/.github/workflows/publish-core-images.yaml index 5a378de8ee4..cf6d64b4381 100644 --- a/.github/workflows/publish-core-images.yaml +++ b/.github/workflows/publish-core-images.yaml @@ -26,8 +26,6 @@ jobs: dockerfile: cmd/db-manager/v1beta1/Dockerfile - component-name: katib-ui dockerfile: cmd/ui/v1beta1/Dockerfile - - component-name: cert-generator - dockerfile: cmd/cert-generator/v1beta1/Dockerfile - component-name: file-metrics-collector dockerfile: cmd/metricscollector/v1beta1/file-metricscollector/Dockerfile - component-name: tfevent-metrics-collector diff --git a/README.md b/README.md index 4c1b96d76d5..266f31353f1 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,6 @@ Make sure that all Katib components are running: $ kubectl get pods -n kubeflow NAME READY STATUS RESTARTS AGE -katib-cert-generator-rw95w 0/1 Completed 0 35s katib-controller-566595bdd8-hbxgf 1/1 Running 0 36s katib-db-manager-57cd769cdb-4g99m 1/1 Running 0 36s katib-mysql-7894994f88-5d4s5 1/1 Running 0 36s diff --git a/cmd/cert-generator/v1beta1/Dockerfile b/cmd/cert-generator/v1beta1/Dockerfile deleted file mode 100644 index 3984005a8c1..00000000000 --- a/cmd/cert-generator/v1beta1/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -# Build the Katib Cert Generator. -FROM golang:alpine AS build-env - -ARG TARGETARCH - -WORKDIR /go/src/github.com/kubeflow/katib - -# Download packages. -COPY go.mod . -COPY go.sum . -RUN go mod download -x - -# Copy sources. -COPY cmd/ cmd/ -COPY pkg/ pkg/ - -# Build the binary. -RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} go build -a -o katib-cert-generator ./cmd/cert-generator/v1beta1 - -# Copy the cert-generator into a thin image. -FROM gcr.io/distroless/static:nonroot -WORKDIR /app -COPY --from=build-env /go/src/github.com/kubeflow/katib/katib-cert-generator /app/ -USER 65532:65532 -ENTRYPOINT ["./katib-cert-generator"] diff --git a/cmd/cert-generator/v1beta1/main.go b/cmd/cert-generator/v1beta1/main.go deleted file mode 100644 index 012b3f5d330..00000000000 --- a/cmd/cert-generator/v1beta1/main.go +++ /dev/null @@ -1,42 +0,0 @@ -/* -Copyright 2022 The Kubeflow Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package main - -import ( - "github.com/kubeflow/katib/pkg/cert-generator/v1beta1" - "k8s.io/client-go/kubernetes/scheme" - "k8s.io/klog" - "os" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/config" -) - -func main() { - kubeClient, err := client.New(config.GetConfigOrDie(), client.Options{Scheme: scheme.Scheme}) - if err != nil { - klog.Fatalf("Failed to create kube client.") - } - - cmd, err := v1beta1.NewKatibCertGeneratorCmd(kubeClient) - if err != nil { - klog.Fatalf("Failed to generate cert: %v", err) - } - - if err = cmd.Execute(); err != nil { - os.Exit(1) - } -} diff --git a/cmd/katib-controller/v1beta1/main.go b/cmd/katib-controller/v1beta1/main.go index 6376f2a0e98..de36409c010 100644 --- a/cmd/katib-controller/v1beta1/main.go +++ b/cmd/katib-controller/v1beta1/main.go @@ -35,6 +35,7 @@ import ( configv1beta1 "github.com/kubeflow/katib/pkg/apis/config/v1beta1" apis "github.com/kubeflow/katib/pkg/apis/controller" + certgenv1beta1 "github.com/kubeflow/katib/pkg/cert-generator/v1beta1" "github.com/kubeflow/katib/pkg/controller.v1beta1" "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" "github.com/kubeflow/katib/pkg/util/v1beta1/katibconfig" @@ -43,7 +44,10 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" ) -var scheme = runtime.NewScheme() +var ( + scheme = runtime.NewScheme() + log = logf.Log.WithName("entrypoint") +) func init() { utilruntime.Must(apis.AddToScheme(scheme)) @@ -53,15 +57,12 @@ func init() { func main() { logf.SetLogger(zap.New()) - log := logf.Log.WithName("entrypoint") var katibConfigFile string flag.StringVar(&katibConfigFile, "katib-config", "", "The katib-controller will load its initial configuration from this file. "+ "Omit this flag to use the default configuration values. ") - // TODO (andreyvelich): Currently it is not possible to set different webhook service name. - // flag.StringVar(&serviceName, "webhook-service-name", "katib-controller", "The service name which will be used in webhook") // TODO (andreyvelich): Currently is is not possible to store webhook cert in the local file system. // flag.BoolVar(&certLocalFS, "cert-localfs", false, "Store the webhook cert in local file system") @@ -122,20 +123,23 @@ func main() { os.Exit(1) } - log.Info("Registering Components.") - - // Setup all Controllers - log.Info("Setting up controller.") - if err := controller.AddToManager(mgr); err != nil { - log.Error(err, "Unable to register controllers to the manager") - os.Exit(1) + ctx := signals.SetupSignalHandler() + certsReady := make(chan struct{}) + if initConfig.CertGeneratorConfig.Enable { + cert := &certgenv1beta1.InternalCert{ + Namespace: consts.DefaultKatibNamespace, + ServiceName: initConfig.CertGeneratorConfig.ServiceName, + KubeClient: mgr.GetClient(), + CertsReady: certsReady, + } + go cert.Generate(ctx) + } else { + close(certsReady) } - log.Info("Setting up webhooks.") - if err := webhook.AddToManager(mgr, *initConfig.ControllerConfig.WebhookPort); err != nil { - log.Error(err, "Unable to register webhooks to the manager") - os.Exit(1) - } + // The setupControllers will register controllers to the manager + // after generated certs for the admission webhooks. + go setupControllers(mgr, initConfig, certsReady) log.Info("Setting up health checker.") if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { @@ -149,9 +153,28 @@ func main() { } // Start the Cmd - log.Info("Starting the Cmd.") - if err := mgr.Start(signals.SetupSignalHandler()); err != nil { + log.Info("Starting the manager.") + if err := mgr.Start(ctx); err != nil { log.Error(err, "Unable to run the manager") os.Exit(1) } } + +func setupControllers(mgr manager.Manager, config configv1beta1.InitConfig, certsReady chan struct{}) { + // The certsReady blocks to register controllers until generated certs. + <-certsReady + + log.Info("Registering Components.") + + // Setup all Controllers + log.Info("Setting up controller.") + if err := controller.AddToManager(mgr); err != nil { + log.Error(err, "Unable to register controllers to the manager") + os.Exit(1) + } + log.Info("Setting up webhooks.") + if err := webhook.AddToManager(mgr, *config.ControllerConfig.WebhookPort); err != nil { + log.Error(err, "Unable to register webhooks to the manager") + os.Exit(1) + } +} diff --git a/docs/developer-guide.md b/docs/developer-guide.md index 88f3333f9c3..c0dc9e0adcf 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -100,23 +100,23 @@ plane CIDR source range to use the Katib webhooks ### Katib cert generator -Katib uses the custom `cert-generator` [Kubernetes Job](https://kubernetes.io/docs/concepts/workloads/controllers/job/) -to generate certificates for the webhooks. +Katib Controller has the internal `cert-generator` to generate certificates for the webhooks. -Once Katib is deployed in the Kubernetes cluster, the `cert-generator` Job follows these steps: +Once Katib is deployed in the Kubernetes cluster, the `cert-generator` follows these steps: - Generate the self-signed certificate and private key. - Create a Kubernetes Secret with the self-signed TLS certificate and private key. - Secret has the `katib-webhook-cert` name and `cert-generator` Job's + Secret has the `katib-webhook-cert` name and `cert-generator` controller Deployment's `ownerReference` to clean-up resources once Katib is uninstalled. - Once Secret is created, the Katib controller Deployment spawns the Pod, - since the controller has the `katib-webhook-cert` Secret volume. +- Save the self-signed TLS certificate and private key on local path (`/tmp/cert`). - Patch the webhooks with the `CABundle`. -You can find the `cert-generator` source code [here](../cmd/cert-generator/v1beta1). +Once the `cert-generator` finished, the Katib controller starts to register controllers such as `experiment-controller` to the manager. + +You can find the `cert-generator` source code [here](../pkg/cert-generator/v1beta1). ## Implement a new algorithm and use it in Katib diff --git a/docs/images-location.md b/docs/images-location.md index ae6321f255d..5afa11d008d 100644 --- a/docs/images-location.md +++ b/docs/images-location.md @@ -64,17 +64,6 @@ The following table shows images for the Dockerfile - - - docker.io/kubeflowkatib/cert-generator - - - Katib Cert Generator - - - Dockerfile - - diff --git a/examples/v1beta1/argo/README.md b/examples/v1beta1/argo/README.md index 2e9d475111e..fd320645d3f 100644 --- a/examples/v1beta1/argo/README.md +++ b/examples/v1beta1/argo/README.md @@ -96,7 +96,6 @@ Check that Katib Controller's pod was restarted: $ kubectl get pods -n kubeflow NAME READY STATUS RESTARTS AGE -katib-cert-generator-hnv6q 0/1 Completed 0 6m12s katib-controller-784994d449-9bgj9 1/1 Running 0 28s katib-db-manager-78697c7bd4-ck7l8 1/1 Running 0 6m13s katib-mysql-854cdb87c4-krcm9 1/1 Running 0 6m13s diff --git a/examples/v1beta1/kind-cluster/README.md b/examples/v1beta1/kind-cluster/README.md index ff7dd512326..81e27927045 100644 --- a/examples/v1beta1/kind-cluster/README.md +++ b/examples/v1beta1/kind-cluster/README.md @@ -27,7 +27,6 @@ If the above script was successful, Katib components will be running: $ kubectl get pods -n kubeflow NAME READY STATUS RESTARTS AGE -katib-cert-generator-tc2jt 0/1 Completed 0 67s katib-controller-566595bdd8-x7z6w 1/1 Running 0 67s katib-db-manager-57cd769cdb-x4lnz 1/1 Running 0 67s katib-mysql-7894994f88-7l8nd 1/1 Running 0 67s diff --git a/examples/v1beta1/tekton/README.md b/examples/v1beta1/tekton/README.md index 8d5833a2271..7f3d3217513 100644 --- a/examples/v1beta1/tekton/README.md +++ b/examples/v1beta1/tekton/README.md @@ -101,7 +101,6 @@ Check that Katib Controller's pod was restarted: $ kubectl get pods -n kubeflow NAME READY STATUS RESTARTS AGE -katib-cert-generator-hnv6q 0/1 Completed 0 6m12s katib-controller-784994d449-9bgj9 1/1 Running 0 28s katib-db-manager-78697c7bd4-ck7l8 1/1 Running 0 6m13s katib-mysql-854cdb87c4-krcm9 1/1 Running 0 6m13s diff --git a/go.mod b/go.mod index 656e5fa773b..aa277240f8c 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,6 @@ require ( github.com/onsi/gomega v1.24.1 github.com/prometheus/client_golang v1.14.0 github.com/shirou/gopsutil/v3 v3.22.5 - github.com/spf13/cobra v1.6.0 github.com/spf13/viper v1.9.0 github.com/tidwall/gjson v1.14.1 golang.org/x/net v0.8.0 @@ -70,7 +69,6 @@ require ( github.com/google/uuid v1.3.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/imdario/mergo v0.3.12 // indirect - github.com/inconshreveable/mousetrap v1.0.1 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect diff --git a/go.sum b/go.sum index f8fdd1b11fd..7a0b1c5864c 100644 --- a/go.sum +++ b/go.sum @@ -322,7 +322,6 @@ github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfc github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= -github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= @@ -700,8 +699,6 @@ github.com/imdario/mergo v0.3.11/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH github.com/imdario/mergo v0.3.12 h1:b6R2BslTbIEToALKP7LxUvijTsNI9TAe80pLWN2g/HU= github.com/imdario/mergo v0.3.12/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= -github.com/inconshreveable/mousetrap v1.0.1 h1:U3uMjPSQEBMNp1lFxmllqCPM6P5u/Xq7Pgzkat/bFNc= -github.com/inconshreveable/mousetrap v1.0.1/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/j-keck/arping v0.0.0-20160618110441-2cf9dc699c56/go.mod h1:ymszkNOg6tORTn+6F6j+Jc8TOr5osrynvN6ivFWZ2GA= github.com/jackc/chunkreader v1.0.0/go.mod h1:RT6O25fNZIuasFJRyZ4R/Y2BbhasbmZXF9QQ7T3kePo= github.com/jackc/chunkreader/v2 v2.0.0/go.mod h1:odVSm741yZoC3dpHEUXIqA9tQRhFrgOHwnPIn9lDKlk= @@ -1138,8 +1135,6 @@ github.com/spf13/cobra v1.0.0/go.mod h1:/6GTrnGXV9HjY+aR4k0oJ5tcvakLuG6EuKReYlHN github.com/spf13/cobra v1.1.1/go.mod h1:WnodtKOvamDL/PwE2M4iKs8aMDBZ5Q5klgD3qfVJQMI= github.com/spf13/cobra v1.1.3/go.mod h1:pGADOWyqRD/YMrPZigI/zbliZ2wVD/23d+is3pSWzOo= github.com/spf13/cobra v1.2.1/go.mod h1:ExllRjgxM/piMAM+3tAZvg8fsklGAf3tPfi+i8t68Nk= -github.com/spf13/cobra v1.6.0 h1:42a0n6jwCot1pUmomAp4T7DeMD+20LFv4Q54pxLf2LI= -github.com/spf13/cobra v1.6.0/go.mod h1:IOw/AERYS7UzyrGinqmz6HLUo219MORXGxhbaJUqzrY= github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= github.com/spf13/jwalterweatherman v1.1.0 h1:ue6voC5bR5F8YxI5S67j9i582FU4Qvo2bmqnqMYADFk= github.com/spf13/jwalterweatherman v1.1.0/go.mod h1:aNWZUN0dPAAO/Ljvb5BEdw96iTZ0EXowPYD95IqWIGo= diff --git a/manifests/v1beta1/components/cert-generator/cert-generator.yaml b/manifests/v1beta1/components/cert-generator/cert-generator.yaml deleted file mode 100644 index 3f06b26d9dd..00000000000 --- a/manifests/v1beta1/components/cert-generator/cert-generator.yaml +++ /dev/null @@ -1,27 +0,0 @@ ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: katib-cert-generator - namespace: kubeflow - labels: - katib.kubeflow.org/component: cert-generator -spec: - template: - metadata: - annotations: - sidecar.istio.io/inject: "false" - spec: - serviceAccountName: katib-cert-generator - containers: - - name: cert-generator - image: docker.io/kubeflowkatib/cert-generator - command: ["./katib-cert-generator"] - args: ["generate", "--namespace=$(KATIB_CORE_NAMESPACE)"] - env: - - name: KATIB_CORE_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - restartPolicy: Never - backoffLimit: 4 diff --git a/manifests/v1beta1/components/cert-generator/kustomization.yaml b/manifests/v1beta1/components/cert-generator/kustomization.yaml deleted file mode 100644 index f1536e80718..00000000000 --- a/manifests/v1beta1/components/cert-generator/kustomization.yaml +++ /dev/null @@ -1,7 +0,0 @@ ---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - cert-generator.yaml - - rbac.yaml diff --git a/manifests/v1beta1/components/cert-generator/rbac.yaml b/manifests/v1beta1/components/cert-generator/rbac.yaml deleted file mode 100644 index d53c8609a2d..00000000000 --- a/manifests/v1beta1/components/cert-generator/rbac.yaml +++ /dev/null @@ -1,48 +0,0 @@ ---- -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: katib-cert-generator -rules: - - apiGroups: - - "" - resources: - - secrets - - services - verbs: - - get - - create - - delete - - apiGroups: - - batch - resources: - - jobs - verbs: - - get - - apiGroups: - - admissionregistration.k8s.io - resources: - - validatingwebhookconfigurations - - mutatingwebhookconfigurations - verbs: - - get - - patch ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: katib-cert-generator - namespace: kubeflow ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: katib-cert-generator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: katib-cert-generator -subjects: - - kind: ServiceAccount - name: katib-cert-generator - namespace: kubeflow diff --git a/manifests/v1beta1/components/controller/controller.yaml b/manifests/v1beta1/components/controller/controller.yaml index c6f97b5f189..c9007efebdf 100644 --- a/manifests/v1beta1/components/controller/controller.yaml +++ b/manifests/v1beta1/components/controller/controller.yaml @@ -51,18 +51,18 @@ spec: fieldRef: fieldPath: metadata.namespace volumeMounts: - - mountPath: /tmp/cert - name: cert - readOnly: true +# - mountPath: /tmp/cert +# name: cert +# readOnly: true - mountPath: /katib-config.yaml name: katib-config subPath: katib-config.yaml readOnly: true volumes: - - name: cert - secret: - defaultMode: 420 - secretName: katib-webhook-cert +# - name: cert +# secret: +# defaultMode: 420 +# secretName: katib-webhook-cert - name: katib-config configMap: name: katib-config diff --git a/manifests/v1beta1/components/controller/rbac.yaml b/manifests/v1beta1/components/controller/rbac.yaml index 68db66b5589..94afc04de15 100644 --- a/manifests/v1beta1/components/controller/rbac.yaml +++ b/manifests/v1beta1/components/controller/rbac.yaml @@ -49,6 +49,14 @@ rules: - pods/status verbs: - "get" + - apiGroups: + - "" + resources: + - secrets + verbs: + - get + - create + - delete - apiGroups: - apps resources: @@ -108,6 +116,14 @@ rules: - suggestions/finalizers verbs: - "*" + - apiGroups: + - admissionregistration.k8s.io + resources: + - validatingwebhookconfigurations + - mutatingwebhookconfigurations + verbs: + - get + - patch --- apiVersion: v1 kind: ServiceAccount diff --git a/manifests/v1beta1/installs/katib-external-db/kustomization.yaml b/manifests/v1beta1/installs/katib-external-db/kustomization.yaml index 28eb85756ba..3713b643516 100644 --- a/manifests/v1beta1/installs/katib-external-db/kustomization.yaml +++ b/manifests/v1beta1/installs/katib-external-db/kustomization.yaml @@ -13,8 +13,6 @@ resources: - ../../components/db-manager/ # Katib UI. - ../../components/ui/ - # Katib Cert Generator - - ../../components/cert-generator/ # Katib webhooks. - ../../components/webhook/ images: @@ -27,9 +25,6 @@ images: - name: docker.io/kubeflowkatib/katib-ui newName: docker.io/kubeflowkatib/katib-ui newTag: latest - - name: docker.io/kubeflowkatib/cert-generator - newName: docker.io/kubeflowkatib/cert-generator - newTag: latest patchesStrategicMerge: - patches/db-manager.yaml # Modify katib-mysql-secrets with parameters for the DB. diff --git a/manifests/v1beta1/installs/katib-standalone-postgres/kustomization.yaml b/manifests/v1beta1/installs/katib-standalone-postgres/kustomization.yaml index 7dda9d5d0a3..0a93de94ade 100644 --- a/manifests/v1beta1/installs/katib-standalone-postgres/kustomization.yaml +++ b/manifests/v1beta1/installs/katib-standalone-postgres/kustomization.yaml @@ -15,8 +15,6 @@ resources: - ../../components/postgres/ # Katib UI. - ../../components/ui/ - # Katib Cert Generator - - ../../components/cert-generator/ # Katib webhooks. - ../../components/webhook/ images: @@ -29,9 +27,6 @@ images: - name: docker.io/kubeflowkatib/katib-ui newName: docker.io/kubeflowkatib/katib-ui newTag: latest - - name: docker.io/kubeflowkatib/cert-generator - newName: docker.io/kubeflowkatib/cert-generator - newTag: latest patchesJson6902: - target: group: apps diff --git a/manifests/v1beta1/installs/katib-standalone/kustomization.yaml b/manifests/v1beta1/installs/katib-standalone/kustomization.yaml index cbf248d907f..990997f9d47 100644 --- a/manifests/v1beta1/installs/katib-standalone/kustomization.yaml +++ b/manifests/v1beta1/installs/katib-standalone/kustomization.yaml @@ -15,8 +15,6 @@ resources: - ../../components/mysql/ # Katib UI. - ../../components/ui/ - # Katib Cert Generator - - ../../components/cert-generator/ # Katib webhooks. - ../../components/webhook/ images: @@ -29,9 +27,6 @@ images: - name: docker.io/kubeflowkatib/katib-ui newName: docker.io/kubeflowkatib/katib-ui newTag: latest - - name: docker.io/kubeflowkatib/cert-generator - newName: docker.io/kubeflowkatib/cert-generator - newTag: latest configMapGenerator: - name: katib-config behavior: create diff --git a/pkg/apis/config/v1beta1/types.go b/pkg/apis/config/v1beta1/types.go index 24e3febff17..cfb50cf7a0c 100644 --- a/pkg/apis/config/v1beta1/types.go +++ b/pkg/apis/config/v1beta1/types.go @@ -40,10 +40,10 @@ type RuntimeConfig struct { // InitConfig is the YAML init structure in Katib config. type InitConfig struct { - ControllerConfig ControllerConfig `json:"controller,omitempty"` + ControllerConfig ControllerConfig `json:"controller,omitempty"` + CertGeneratorConfig CertGeneratorConfig `json:"certGenerator,omitempty"` // TODO: Adding a config for the following components would be nice. - // - Webhook Certs // - Katib DB // - Katib DB Manager // - Katib UI @@ -82,6 +82,13 @@ type ControllerConfig struct { LeaderElectionID string `json:"leaderElectionID,omitempty"` } +type CertGeneratorConfig struct { + Enable bool + // ServiceName indicates which service is used for the admission webhook. + // Defaults to 'katib-controller' + ServiceName string `json:"serviceName,omitempty"` +} + // SuggestionConfig is the suggestion structure in Katib config. type SuggestionConfig struct { AlgorithmName string `json:"algorithmName"` diff --git a/pkg/apis/manager/health/health.pb.go b/pkg/apis/manager/health/health.pb.go index 1f598c1ea49..80f57aa9b55 100644 --- a/pkg/apis/manager/health/health.pb.go +++ b/pkg/apis/manager/health/health.pb.go @@ -5,9 +5,11 @@ Package grpc_health_v1 is a generated protocol buffer package. It is generated from these files: + health.proto It has these top-level messages: + HealthCheckRequest HealthCheckResponse */ diff --git a/pkg/apis/manager/v1beta1/api.pb.go b/pkg/apis/manager/v1beta1/api.pb.go index b73f354974f..f2c20663e20 100644 --- a/pkg/apis/manager/v1beta1/api.pb.go +++ b/pkg/apis/manager/v1beta1/api.pb.go @@ -5,9 +5,11 @@ Package api_v1_beta1 is a generated protocol buffer package. It is generated from these files: + api.proto It has these top-level messages: + Experiment ExperimentSpec ParameterSpec @@ -1138,10 +1140,12 @@ func (m *ValidateAlgorithmSettingsRequest) GetExperiment() *Experiment { type ValidateAlgorithmSettingsReply struct { } -func (m *ValidateAlgorithmSettingsReply) Reset() { *m = ValidateAlgorithmSettingsReply{} } -func (m *ValidateAlgorithmSettingsReply) String() string { return proto.CompactTextString(m) } -func (*ValidateAlgorithmSettingsReply) ProtoMessage() {} -func (*ValidateAlgorithmSettingsReply) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{29} } +func (m *ValidateAlgorithmSettingsReply) Reset() { *m = ValidateAlgorithmSettingsReply{} } +func (m *ValidateAlgorithmSettingsReply) String() string { return proto.CompactTextString(m) } +func (*ValidateAlgorithmSettingsReply) ProtoMessage() {} +func (*ValidateAlgorithmSettingsReply) Descriptor() ([]byte, []int) { + return fileDescriptor0, []int{29} +} type GetEarlyStoppingRulesRequest struct { Experiment *Experiment `protobuf:"bytes,1,opt,name=experiment" json:"experiment,omitempty"` diff --git a/pkg/cert-generator/v1beta1/cert-generator.go b/pkg/cert-generator/v1beta1/cert-generator.go deleted file mode 100644 index c7f76e25ec9..00000000000 --- a/pkg/cert-generator/v1beta1/cert-generator.go +++ /dev/null @@ -1,35 +0,0 @@ -/* -Copyright 2022 The Kubeflow Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1beta1 - -import ( - "github.com/kubeflow/katib/pkg/cert-generator/v1beta1/consts" - "github.com/kubeflow/katib/pkg/cert-generator/v1beta1/generate" - "github.com/spf13/cobra" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -// NewKatibCertGeneratorCmd sets up `katib-cert-generator` command. -func NewKatibCertGeneratorCmd(kubeClient client.Client) (*cobra.Command, error) { - cmd := &cobra.Command{ - Use: consts.JobName, - Short: consts.JobName, - Long: consts.JobName, - } - cmd.AddCommand(generate.NewGenerateCmd(kubeClient)) - return cmd, nil -} diff --git a/pkg/cert-generator/v1beta1/generate/certificate.go b/pkg/cert-generator/v1beta1/certificate.go similarity index 98% rename from pkg/cert-generator/v1beta1/generate/certificate.go rename to pkg/cert-generator/v1beta1/certificate.go index 57a3fb53490..dc091a30384 100644 --- a/pkg/cert-generator/v1beta1/generate/certificate.go +++ b/pkg/cert-generator/v1beta1/certificate.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package generate +package v1beta1 import ( "bytes" diff --git a/pkg/cert-generator/v1beta1/consts/const.go b/pkg/cert-generator/v1beta1/const.go similarity index 75% rename from pkg/cert-generator/v1beta1/consts/const.go rename to pkg/cert-generator/v1beta1/const.go index ca943deedd6..b068d15c34d 100644 --- a/pkg/cert-generator/v1beta1/consts/const.go +++ b/pkg/cert-generator/v1beta1/const.go @@ -14,11 +14,12 @@ See the License for the specific language governing permissions and limitations under the License. */ -package consts +package v1beta1 const ( - Service = "katib-controller" - JobName = "katib-cert-generator" - Secret = "katib-webhook-cert" - Webhook = "katib.kubeflow.org" + Service = "katib-controller" + Secret = "katib-webhook-cert" + Webhook = "katib.kubeflow.org" + serverKeyName = "tls.key" + serverCertName = "tls.crt" ) diff --git a/pkg/cert-generator/v1beta1/generate.go b/pkg/cert-generator/v1beta1/generate.go new file mode 100644 index 00000000000..d33c952ecc0 --- /dev/null +++ b/pkg/cert-generator/v1beta1/generate.go @@ -0,0 +1,265 @@ +/* +Copyright 2022 The Kubeflow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1beta1 + +import ( + "bytes" + "context" + "crypto/rand" + "crypto/rsa" + "crypto/x509" + "crypto/x509/pkix" + "errors" + "fmt" + "math/big" + "os" + "path" + "strings" + "time" + + admissionregistrationv1 "k8s.io/api/admissionregistration/v1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" +) + +var ( + errServiceNotFound = errors.New("unable to locate controller service") + errCertCheckFail = errors.New("failed to check if certs already exist") + errCreateCertFail = errors.New("failed to create certs") + errCreateCertSecretFail = errors.New("failed to create secret embedded certs") + errSaveCertOnLocal = errors.New("failed to save certs on local") + errInjectCertError = errors.New("failed to inject certs into WebhookConfigurations") +) + +// InternalCert contains values for all certificates. +type InternalCert struct { + Namespace string + ServiceName string + KubeClient client.Client + CertsReady chan struct{} + + certs *certificates + fullServiceDomain string +} + +// Generate generates certificates for the admission webhooks. +func (o *InternalCert) Generate(ctx context.Context) { + if err := o.generate(ctx); err != nil { + klog.Errorf("Failed to generate certs: %v", err) + os.Exit(1) + } + // Close a CertsReady means start to register controllers to the manager. + close(o.CertsReady) +} + +// generate is the main logic for the cert generation. +func (o *InternalCert) generate(ctx context.Context) error { + controllerService := &corev1.Service{} + if err := o.KubeClient.Get(ctx, client.ObjectKey{Namespace: o.Namespace, Name: o.ServiceName}, controllerService); err != nil { + return fmt.Errorf("%w: %v", errServiceNotFound, err) + } + + certExist, err := o.isCertExist(ctx) + if err != nil { + return fmt.Errorf("%w: %v", errCertCheckFail, err) + } + if !certExist { + o.fullServiceDomain = strings.Join([]string{o.ServiceName, o.Namespace, "svc"}, ".") + + if err = o.createCert(); err != nil { + return fmt.Errorf("%w: %v", errCreateCertFail, err) + } + if err = o.createCertSecret(ctx); err != nil { + return fmt.Errorf("%w: %v", errCreateCertSecretFail, err) + } + } + if err = o.saveCertOnLocal(); err != nil { + return fmt.Errorf("%w: %v", errSaveCertOnLocal, err) + } + if err = o.injectCert(ctx); err != nil { + return fmt.Errorf("%w: %v", errInjectCertError, err) + } + return nil +} + +// isCertExist checks if a secret embedded certs already exists. +// For example, it will return true if the katib-controller is created with enabled leader-election +// since another controller pod will create the secret. +func (o *InternalCert) isCertExist(ctx context.Context) (bool, error) { + secret := &corev1.Secret{} + if err := o.KubeClient.Get(ctx, client.ObjectKey{Name: Secret, Namespace: o.Namespace}, secret); err != nil { + if apierrors.IsNotFound(err) { + return false, nil + } + return false, err + } + key := secret.Data[serverKeyName] + cert := secret.Data[serverCertName] + if len(key) != 0 && len(cert) != 0 { + o.certs = &certificates{ + keyPem: key, + certPem: cert, + } + return true, nil + } + return false, nil +} + +// saveCertOnLocal saves the certs on local. +func (o *InternalCert) saveCertOnLocal() error { + if err := os.MkdirAll(consts.CertDir, 0760); err != nil { + return err + } + f, err := os.Create(path.Join(consts.CertDir, serverKeyName)) + if err != nil { + return err + } + if _, err = f.Write(o.certs.keyPem); err != nil { + return err + } + f, err = os.Create(path.Join(consts.CertDir, serverCertName)) + _, err = f.Write(o.certs.certPem) + return err +} + +// createCert creates the self-signed certificate and private key. +func (o *InternalCert) createCert() error { + now := time.Now() + template := &x509.Certificate{ + SerialNumber: big.NewInt(0), + Subject: pkix.Name{ + CommonName: o.fullServiceDomain, + }, + DNSNames: []string{ + o.fullServiceDomain, + }, + NotBefore: now, + NotAfter: now.Add(24 * time.Hour * 365 * 10), + KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + } + + klog.Info("Generating self-signed public certificate and private key.") + rawKey, err := rsa.GenerateKey(rand.Reader, 2048) + if err != nil { + return err + } + + der, err := x509.CreateCertificate(rand.Reader, template, template, rawKey.Public(), rawKey) + if err != nil { + return err + } + if o.certs, err = encode(rawKey, der); err != nil { + return err + } + return nil +} + +// createCertSecret creates Secret embedded tls.key and tls.crt. +func (o *InternalCert) createCertSecret(ctx context.Context) error { + controller := &appsv1.Deployment{} + err := o.KubeClient.Get(ctx, client.ObjectKey{Name: consts.DefaultKatibControllerName, Namespace: o.Namespace}, controller) + if err != nil { + return err + } + + // Create secret with CA cert and server cert/key. + // Add ownerReferences to clean-up secret with controller Pod. + isController := true + webhookCertSecret := &corev1.Secret{ + TypeMeta: metav1.TypeMeta{ + Kind: "Secret", + APIVersion: corev1.SchemeGroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: Secret, + Namespace: o.Namespace, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: appsv1.SchemeGroupVersion.String(), + Kind: "Deployment", + Controller: &isController, + Name: controller.Name, + UID: controller.UID, + }, + }, + }, + Type: corev1.SecretTypeTLS, + Data: map[string][]byte{ + serverKeyName: o.certs.keyPem, + serverCertName: o.certs.certPem, + }, + } + + oldSecret := &corev1.Secret{} + err = o.KubeClient.Get(ctx, client.ObjectKey{Namespace: o.Namespace, Name: Secret}, oldSecret) + if client.IgnoreNotFound(err) != nil { + return err + } + if err == nil { + klog.Warning("Previous secret was found and removed.") + if err = o.KubeClient.Delete(ctx, oldSecret); err != nil { + return err + } + } + + klog.Infof("Creating Secret: %q", Secret) + if err = o.KubeClient.Create(ctx, webhookCertSecret); err != nil { + return err + } + return nil +} + +// injectCert applies patch to ValidatingWebhookConfiguration and MutatingWebhookConfiguration. +func (o *InternalCert) injectCert(ctx context.Context) error { + validatingConf := &admissionregistrationv1.ValidatingWebhookConfiguration{} + if err := o.KubeClient.Get(ctx, client.ObjectKey{Name: Webhook}, validatingConf); err != nil { + return err + } + if !bytes.Equal(validatingConf.Webhooks[0].ClientConfig.CABundle, o.certs.certPem) { + newValidatingConf := validatingConf.DeepCopy() + newValidatingConf.Webhooks[0].ClientConfig.CABundle = o.certs.certPem + klog.Info("Trying to patch ValidatingWebhookConfiguration adding the caBundle.") + if err := o.KubeClient.Patch(ctx, newValidatingConf, client.MergeFrom(validatingConf)); err != nil { + klog.Errorf("Unable to patch ValidatingWebhookConfiguration %q", Webhook) + return err + } + } + + mutatingConf := &admissionregistrationv1.MutatingWebhookConfiguration{} + if err := o.KubeClient.Get(ctx, client.ObjectKey{Name: Webhook}, mutatingConf); err != nil { + return err + } + if !bytes.Equal(mutatingConf.Webhooks[0].ClientConfig.CABundle, o.certs.certPem) || + !bytes.Equal(mutatingConf.Webhooks[1].ClientConfig.CABundle, o.certs.certPem) { + newMutatingConf := mutatingConf.DeepCopy() + newMutatingConf.Webhooks[0].ClientConfig.CABundle = o.certs.certPem + newMutatingConf.Webhooks[1].ClientConfig.CABundle = o.certs.certPem + klog.Info("Trying to patch MutatingWebhookConfiguration adding the caBundle.") + if err := o.KubeClient.Patch(ctx, newMutatingConf, client.MergeFrom(mutatingConf)); err != nil { + klog.Errorf("Unable to patch MutatingWebhookConfiguration %q", Webhook) + return err + } + } + return nil +} diff --git a/pkg/cert-generator/v1beta1/generate/generate.go b/pkg/cert-generator/v1beta1/generate/generate.go deleted file mode 100644 index f0c37370038..00000000000 --- a/pkg/cert-generator/v1beta1/generate/generate.go +++ /dev/null @@ -1,212 +0,0 @@ -/* -Copyright 2022 The Kubeflow Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package generate - -import ( - "context" - "crypto/rand" - "crypto/rsa" - "crypto/x509" - "crypto/x509/pkix" - "math/big" - "strings" - "time" - - "github.com/kubeflow/katib/pkg/cert-generator/v1beta1/consts" - "github.com/spf13/cobra" - admissionregistrationv1 "k8s.io/api/admissionregistration/v1" - batchv1 "k8s.io/api/batch/v1" - corev1 "k8s.io/api/core/v1" - k8serrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/klog" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -// generateOptions contains values for all certificates. -type generateOptions struct { - namespace string - serviceName string - jobName string - fullServiceDomain string -} - -// NewGenerateCmd sets up `generate` subcommand. -func NewGenerateCmd(kubeClient client.Client) *cobra.Command { - o := &generateOptions{} - cmd := &cobra.Command{ - Use: "generate", - Short: "generate server cert for webhook", - Long: "generate server cert for webhook", - SilenceUsage: true, - RunE: func(cmd *cobra.Command, args []string) error { - if err := o.run(context.TODO(), kubeClient); err != nil { - return err - } - return nil - }, - } - f := cmd.Flags() - f.StringVarP(&o.namespace, "namespace", "n", "kubeflow", "set namespace") - f.StringVarP(&o.jobName, "jobName", "j", consts.JobName, "set job name") - f.StringVarP(&o.serviceName, "serviceName", "s", consts.Service, "set service name") - return cmd -} - -// run is main function for `generate` subcommand. -func (o *generateOptions) run(ctx context.Context, kubeClient client.Client) error { - controllerService := &corev1.Service{} - if err := kubeClient.Get(ctx, client.ObjectKey{Namespace: o.namespace, Name: o.serviceName}, controllerService); err != nil { - klog.Errorf("Unable to locate controller service: %s", o.serviceName) - return err - } - - o.fullServiceDomain = strings.Join([]string{o.serviceName, o.namespace, "svc"}, ".") - - keyPair, err := o.createCert() - if err != nil { - return err - } - - if err = o.createWebhookCertSecret(ctx, kubeClient, keyPair); err != nil { - return err - } - if err = o.injectCert(ctx, kubeClient, keyPair); err != nil { - return err - } - - return nil -} - -// createCert creates the self-signed certificate and private key. -func (o *generateOptions) createCert() (*certificates, error) { - now := time.Now() - template := &x509.Certificate{ - SerialNumber: big.NewInt(0), - Subject: pkix.Name{ - CommonName: o.fullServiceDomain, - }, - DNSNames: []string{ - o.fullServiceDomain, - }, - NotBefore: now, - NotAfter: now.Add(24 * time.Hour * 365 * 10), - KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, - } - - klog.Info("Generating self-signed public certificate and private key.") - rawKey, err := rsa.GenerateKey(rand.Reader, 2048) - if err != nil { - return nil, err - } - - der, err := x509.CreateCertificate(rand.Reader, template, template, rawKey.Public(), rawKey) - if err != nil { - return nil, err - } - - return encode(rawKey, der) -} - -// createWebhookCertSecret creates Secret embedded tls.key and tls.crt. -func (o *generateOptions) createWebhookCertSecret(ctx context.Context, kubeClient client.Client, keyPair *certificates) error { - - certGeneratorJob := &batchv1.Job{} - if err := kubeClient.Get(ctx, client.ObjectKey{Namespace: o.namespace, Name: o.jobName}, certGeneratorJob); err != nil { - return err - } - - // Create secret with CA cert and server cert/key. - // Add ownerReferences to clean-up secret with cert generator Job. - isController := true - jobUID := certGeneratorJob.UID - webhookCertSecret := &corev1.Secret{ - TypeMeta: metav1.TypeMeta{ - Kind: "Secret", - APIVersion: "v1", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: consts.Secret, - Namespace: o.namespace, - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "batch/v1", - Kind: "Job", - Controller: &isController, - Name: o.jobName, - UID: jobUID, - }, - }, - }, - Type: corev1.SecretTypeTLS, - Data: map[string][]byte{ - "tls.key": keyPair.keyPem, - "tls.crt": keyPair.certPem, - }, - } - - oldSecret := &corev1.Secret{} - err := kubeClient.Get(ctx, client.ObjectKey{Namespace: o.namespace, Name: consts.Secret}, oldSecret) - switch { - case err != nil && !k8serrors.IsNotFound(err): - return err - case err == nil: - klog.Warning("Previous secret was found and removed.") - if err = kubeClient.Delete(ctx, oldSecret); err != nil { - return err - } - } - - klog.Infof("Creating Secret: %s", consts.Secret) - if err = kubeClient.Create(ctx, webhookCertSecret); err != nil { - return err - } - return nil -} - -// injectCert applies patch to ValidatingWebhookConfiguration and MutatingWebhookConfiguration. -func (o *generateOptions) injectCert(ctx context.Context, kubeClient client.Client, keyPair *certificates) error { - validatingConf := &admissionregistrationv1.ValidatingWebhookConfiguration{} - if err := kubeClient.Get(ctx, client.ObjectKey{Name: consts.Webhook}, validatingConf); err != nil { - return err - } - newValidatingConf := validatingConf.DeepCopy() - newValidatingConf.Webhooks[0].ClientConfig.CABundle = keyPair.certPem - - klog.Info("Trying to patch ValidatingWebhookConfiguration adding the caBundle.") - if err := kubeClient.Patch(ctx, newValidatingConf, client.MergeFrom(validatingConf)); err != nil { - klog.Errorf("Unable to patch ValidatingWebhookConfiguration %s", consts.Webhook) - return err - } - - mutatingConf := &admissionregistrationv1.MutatingWebhookConfiguration{} - if err := kubeClient.Get(ctx, client.ObjectKey{Name: consts.Webhook}, mutatingConf); err != nil { - return err - } - newMutatingConf := mutatingConf.DeepCopy() - newMutatingConf.Webhooks[0].ClientConfig.CABundle = keyPair.certPem - newMutatingConf.Webhooks[1].ClientConfig.CABundle = keyPair.certPem - - klog.Info("Trying to patch MutatingWebhookConfiguration adding the caBundle.") - if err := kubeClient.Patch(ctx, newMutatingConf, client.MergeFrom(mutatingConf)); err != nil { - klog.Errorf("Unable to patch MutatingWebhookConfiguration %s", consts.Webhook) - return err - } - - return nil -} diff --git a/pkg/cert-generator/v1beta1/generate/generate_test.go b/pkg/cert-generator/v1beta1/generate/generate_test.go deleted file mode 100644 index e07915c74de..00000000000 --- a/pkg/cert-generator/v1beta1/generate/generate_test.go +++ /dev/null @@ -1,197 +0,0 @@ -/* -Copyright 2022 The Kubeflow Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package generate - -import ( - "log" - "strings" - "testing" - - "github.com/kubeflow/katib/pkg/cert-generator/v1beta1/consts" - admissionregistration "k8s.io/api/admissionregistration/v1" - batchv1 "k8s.io/api/batch/v1" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/scheme" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestGenerate(t *testing.T) { - - const testNamespace = "test" - - testGeneratorJob := &batchv1.Job{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "batch/v1", - Kind: "Job", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: consts.JobName, - Namespace: testNamespace, - UID: "test", - }, - } - testValidatingWebhook := &admissionregistration.ValidatingWebhookConfiguration{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "admissionregistration.k8s.io/v1", - Kind: "ValidatingWebhookConfiguration", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: consts.Webhook, - }, - Webhooks: []admissionregistration.ValidatingWebhook{ - { - Name: strings.Join([]string{"validator.experiment", consts.Webhook}, "."), - ClientConfig: admissionregistration.WebhookClientConfig{ - CABundle: []byte("CG=="), - }, - }, - }, - } - testMutatingWebhook := &admissionregistration.MutatingWebhookConfiguration{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "admissionregistration.k8s.io/v1", - Kind: "MutatingWebhookConfiguration", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: consts.Webhook, - }, - Webhooks: []admissionregistration.MutatingWebhook{ - { - Name: strings.Join([]string{"defaulter.experiment", consts.Webhook}, "."), - ClientConfig: admissionregistration.WebhookClientConfig{ - CABundle: []byte("CG=="), - }, - }, - { - Name: strings.Join([]string{"mutator.pod", consts.Webhook}, "."), - ClientConfig: admissionregistration.WebhookClientConfig{ - CABundle: []byte("CG=="), - }, - }, - }, - } - oldWebhookCertSecret := &corev1.Secret{ - TypeMeta: metav1.TypeMeta{ - Kind: "Secret", - APIVersion: "v1", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: consts.Secret, - Namespace: testNamespace, - }, - } - testControllerService := &corev1.Service{ - TypeMeta: metav1.TypeMeta{ - Kind: "Service", - APIVersion: "v1", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: consts.Service, - Namespace: testNamespace, - }, - } - - tests := []struct { - testDescription string - err bool - objects []client.Object - }{ - { - testDescription: "Generate successfully", - err: false, - objects: []client.Object{ - testGeneratorJob, - testValidatingWebhook, - testMutatingWebhook, - testControllerService, - }, - }, - { - testDescription: "There is old Secret, katib-webhook-cert", - err: false, - objects: []client.Object{ - testGeneratorJob, - testValidatingWebhook, - testMutatingWebhook, - oldWebhookCertSecret, - testControllerService, - }, - }, - { - testDescription: "There is not Job, katib-cert-generator", - err: true, - objects: []client.Object{ - testValidatingWebhook, - testMutatingWebhook, - testControllerService, - }, - }, - { - testDescription: "There is not ValidatingWebhookConfiguration", - err: true, - objects: []client.Object{ - testGeneratorJob, - testMutatingWebhook, - testControllerService, - }, - }, - { - testDescription: "There is not MutatingWebhookConfiguration", - err: true, - objects: []client.Object{ - testGeneratorJob, - testValidatingWebhook, - testControllerService, - }, - }, - { - testDescription: "There is no Service katib-controller", - err: true, - objects: []client.Object{ - testGeneratorJob, - testMutatingWebhook, - }, - }, - } - - for _, test := range tests { - t.Run(test.testDescription, func(t *testing.T) { - if err := executeGeneratorCommand(test.objects, testNamespace); (err != nil) != test.err { - t.Errorf("expected error: %v, got: '%v'\n", test.err, err) - } - }) - } - -} - -func executeGeneratorCommand(kubeResources []client.Object, namespace string) error { - - fakeClientBuilder := fake.NewClientBuilder().WithScheme(scheme.Scheme) - if len(kubeResources) > 0 { - for _, r := range kubeResources { - fakeClientBuilder.WithObjects(r) - } - } - cmd := NewGenerateCmd(fakeClientBuilder.Build()) - if err := cmd.Flags().Set("namespace", namespace); err != nil { - log.Fatal(err) - } - - return cmd.Execute() -} diff --git a/pkg/cert-generator/v1beta1/generate_test.go b/pkg/cert-generator/v1beta1/generate_test.go new file mode 100644 index 00000000000..1bad8108041 --- /dev/null +++ b/pkg/cert-generator/v1beta1/generate_test.go @@ -0,0 +1,248 @@ +/* +Copyright 2022 The Kubeflow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1beta1 + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + admissionregistration "k8s.io/api/admissionregistration/v1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" +) + +func TestGenerate(t *testing.T) { + const testNamespace = "test" + + controllerDeployment := &appsv1.Deployment{ + TypeMeta: metav1.TypeMeta{ + Kind: "Deployment", + APIVersion: appsv1.SchemeGroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "katib-controller", + Namespace: testNamespace, + UID: "test", + }, + } + emptyVWebhookConfig := &admissionregistration.ValidatingWebhookConfiguration{ + TypeMeta: metav1.TypeMeta{ + APIVersion: admissionregistration.SchemeGroupVersion.String(), + Kind: "ValidatingWebhookConfiguration", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: Webhook, + }, + Webhooks: []admissionregistration.ValidatingWebhook{ + { + Name: strings.Join([]string{"validator.experiment", Webhook}, "."), + ClientConfig: admissionregistration.WebhookClientConfig{}, + }, + }, + } + emptyMWebhookConfig := &admissionregistration.MutatingWebhookConfiguration{ + TypeMeta: metav1.TypeMeta{ + APIVersion: admissionregistration.SchemeGroupVersion.String(), + Kind: "MutatingWebhookConfiguration", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: Webhook, + }, + Webhooks: []admissionregistration.MutatingWebhook{ + { + Name: strings.Join([]string{"defaulter.experiment", Webhook}, "."), + ClientConfig: admissionregistration.WebhookClientConfig{}, + }, + { + Name: strings.Join([]string{"mutator.pod", Webhook}, "."), + ClientConfig: admissionregistration.WebhookClientConfig{}, + }, + }, + } + controllerSecret := &corev1.Secret{ + TypeMeta: metav1.TypeMeta{ + Kind: "Secret", + APIVersion: corev1.SchemeGroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: Secret, + Namespace: testNamespace, + }, + } + controllerService := &corev1.Service{ + TypeMeta: metav1.TypeMeta{ + Kind: "Service", + APIVersion: corev1.SchemeGroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: Service, + Namespace: testNamespace, + }, + } + + tests := map[string]struct { + objects []client.Object + opts *InternalCert + wantError error + }{ + "Generate successfully": { + opts: &InternalCert{ + Namespace: testNamespace, + ServiceName: "katib-controller", + }, + objects: []client.Object{ + controllerDeployment, + emptyVWebhookConfig, + emptyMWebhookConfig, + controllerService, + }, + }, + "There is an old Secret, katib-webhook-cert": { + opts: &InternalCert{ + Namespace: testNamespace, + ServiceName: "katib-controller", + }, + objects: []client.Object{ + controllerDeployment, + emptyVWebhookConfig, + emptyMWebhookConfig, + controllerService, + controllerSecret, + }, + }, + "There is not Deployment, katib-controller": { + opts: &InternalCert{ + Namespace: testNamespace, + ServiceName: "katib-controller", + }, + objects: []client.Object{ + emptyVWebhookConfig, + emptyMWebhookConfig, + controllerService, + }, + wantError: errCreateCertSecretFail, + }, + "There is not ValidatingWebhookConfiguration": { + opts: &InternalCert{ + Namespace: testNamespace, + ServiceName: "katib-controller", + }, + objects: []client.Object{ + controllerDeployment, + emptyMWebhookConfig, + controllerService, + }, + wantError: errInjectCertError, + }, + "There is not MutatingWebhookConfiguration": { + opts: &InternalCert{ + Namespace: testNamespace, + ServiceName: "katib-controller", + }, + objects: []client.Object{ + controllerDeployment, + emptyVWebhookConfig, + controllerService, + }, + wantError: errInjectCertError, + }, + "There is no Service katib-controller": { + opts: &InternalCert{ + Namespace: testNamespace, + ServiceName: "katib-controller", + }, + objects: []client.Object{ + controllerDeployment, + emptyVWebhookConfig, + emptyMWebhookConfig, + }, + wantError: errServiceNotFound, + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + if err := os.RemoveAll(consts.CertDir); err != nil { + t.Fatalf("Failed to clean up cert dir: %v", err) + } + + kc := buildFakeClient(tc.objects) + tc.opts.KubeClient = kc + err := tc.opts.generate(context.Background()) + if diff := cmp.Diff(tc.wantError, err, cmpopts.EquateErrors()); len(diff) != 0 { + t.Errorf("Unexpected error from generate() (-want,+got):\n%s", diff) + } + + if tc.wantError == nil { + secret := &corev1.Secret{} + if err = kc.Get(context.Background(), client.ObjectKey{Name: Secret, Namespace: testNamespace}, secret); err != nil { + t.Fatalf("Failed to get a controllerSecret: %v", err) + } + if !metav1.IsControlledBy(secret, controllerDeployment) { + t.Errorf("Unexpected owner for the secret: %v", secret.OwnerReferences) + } + if len(secret.Data[serverKeyName]) == 0 { + t.Errorf("Unexpected tls.key embedded in secret: %v", secret.Data) + } + if len(secret.Data[serverCertName]) == 0 { + t.Errorf("Unexpected tls.crt embedded in secret: %v", secret.Data) + } + + if _, err = os.Stat(filepath.Join(consts.CertDir, serverKeyName)); err != nil { + t.Errorf("Failed to find tls.key: %v", err) + } + if _, err = os.Stat(filepath.Join(consts.CertDir, serverCertName)); err != nil { + t.Errorf("Failed to find tls.crt: %v", err) + } + + vConfig := &admissionregistration.ValidatingWebhookConfiguration{} + if err = kc.Get(context.Background(), client.ObjectKey{Name: Webhook}, vConfig); err != nil { + t.Fatalf("Failed to get a ValidatingWebhookConfiguration: %v", err) + } + if len(vConfig.Webhooks[0].ClientConfig.CABundle) == 0 { + t.Errorf("Unexpected tls.crt embedded in ValidatingWebhookConfiguration: %v", vConfig.Webhooks) + } + + mConfig := &admissionregistration.MutatingWebhookConfiguration{} + if err = kc.Get(context.Background(), client.ObjectKey{Name: Webhook}, mConfig); err != nil { + t.Fatalf("Failed to get a MutatingWebhookConfiguration: %v", err) + } + if len(mConfig.Webhooks[0].ClientConfig.CABundle) == 0 || len(mConfig.Webhooks[1].ClientConfig.CABundle) == 0 { + t.Errorf("Unexpected tls.crt embedded in MutatingWebhookConfiguration: %v", mConfig.Webhooks) + } + } + }) + } +} + +func buildFakeClient(kubeResources []client.Object) client.Client { + fakeClientBuilder := fake.NewClientBuilder().WithScheme(scheme.Scheme) + if len(kubeResources) > 0 { + fakeClientBuilder.WithObjects(kubeResources...) + } + return fakeClientBuilder.Build() +} diff --git a/pkg/controller.v1beta1/consts/const.go b/pkg/controller.v1beta1/consts/const.go index 50f3621f1f7..6d9abb05341 100644 --- a/pkg/controller.v1beta1/consts/const.go +++ b/pkg/controller.v1beta1/consts/const.go @@ -51,6 +51,9 @@ const ( // TODO (andreyvelich): Currently is is not possible to store webhook cert in the local file system // ConfigCertLocalFS = "cert-local-filesystem" + // CertDir is the location saved certs for the webhooks. + CertDir = "/tmp/cert" + // ConfigInjectSecurityContext is the config name which indicates // if we should inject the security context into the metrics collector // sidecar. @@ -100,6 +103,8 @@ const ( DefaultKatibNamespaceEnvName = "KATIB_CORE_NAMESPACE" // DefaultKatibComposerEnvName is the default env name of katib suggestion composer DefaultKatibComposerEnvName = "KATIB_SUGGESTION_COMPOSER" + // DefaultKatibControllerNameEnvName is the env name of controller deployment's name. + DefaultKatibControllerNameEnvName = "KATIB_CONTROLLER_NAME" // DefaultKatibDBManagerServiceNamespaceEnvName is the env name of Katib DB Manager namespace DefaultKatibDBManagerServiceNamespaceEnvName = "KATIB_DB_MANAGER_SERVICE_NAMESPACE" @@ -163,6 +168,8 @@ var ( DefaultKatibNamespace = env.GetEnvOrDefault(DefaultKatibNamespaceEnvName, "kubeflow") // DefaultComposer is the default composer of katib suggestion. DefaultComposer = env.GetEnvOrDefault(DefaultKatibComposerEnvName, "General") + // DefaultKatibControllerName is the default katib-controller deployment name. + DefaultKatibControllerName = env.GetEnvOrDefault(DefaultKatibControllerNameEnvName, "katib-controller") // DefaultKatibDBManagerServiceNamespace is the default namespace of Katib DB Manager DefaultKatibDBManagerServiceNamespace = env.GetEnvOrDefault(DefaultKatibDBManagerServiceNamespaceEnvName, DefaultKatibNamespace) diff --git a/pkg/webhook/v1beta1/webhook.go b/pkg/webhook/v1beta1/webhook.go index e5ac6e607de..09ca2790fab 100644 --- a/pkg/webhook/v1beta1/webhook.go +++ b/pkg/webhook/v1beta1/webhook.go @@ -18,6 +18,7 @@ package webhook import ( "fmt" + "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/webhook" @@ -30,7 +31,7 @@ func AddToManager(mgr manager.Manager, port int) error { // Create a webhook server. hookServer := &webhook.Server{ Port: port, - CertDir: "/tmp/cert", + CertDir: consts.CertDir, } if err := mgr.Add(hookServer); err != nil { return fmt.Errorf("Add webhook server to the manager failed: %v", err) diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index e511845536e..97d4b9ed039 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -64,9 +64,6 @@ docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/katib-db-manager: echo -e "\nBuilding Katib UI image...\n" docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/katib-ui:${TAG}" -f ${CMD_PREFIX}/ui/${VERSION}/Dockerfile . -echo -e "\nBuilding Katib cert generator image...\n" -docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/cert-generator:${TAG}" -f ${CMD_PREFIX}/cert-generator/${VERSION}/Dockerfile . - echo -e "\nBuilding file metrics collector image...\n" docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/file-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile . diff --git a/scripts/v1beta1/push.sh b/scripts/v1beta1/push.sh index 6f0627b4081..9a6c70c546f 100755 --- a/scripts/v1beta1/push.sh +++ b/scripts/v1beta1/push.sh @@ -44,9 +44,6 @@ docker push "${REGISTRY}/katib-db-manager:${TAG}" echo -e "\nPushing Katib UI image...\n" docker push "${REGISTRY}/katib-ui:${TAG}" -echo -e "\nPushing Katib cert generator image...\n" -docker push "${REGISTRY}/cert-generator:${TAG}" - echo -e "\nPushing file metrics collector image...\n" docker push "${REGISTRY}/file-metrics-collector:${TAG}" diff --git a/test/e2e/v1beta1/hack/aws/argo_workflow.py b/test/e2e/v1beta1/hack/aws/argo_workflow.py deleted file mode 100644 index ffd288634b0..00000000000 --- a/test/e2e/v1beta1/hack/aws/argo_workflow.py +++ /dev/null @@ -1,390 +0,0 @@ -# Copyright 2022 The Kubeflow Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script creates Argo Workflow for the e2e Katib tests. - -from kubeflow.testing import argo_build_util - - -# Main worker image to execute Workflow. -IMAGE_WORKER = "public.ecr.aws/j1r0q0g6/kubeflow-testing:latest" -# Kaniko image to build Katib images. -IMAGE_KANIKO = "gcr.io/kaniko-project/executor:v1.0.0" - -# Volume to store test data among the Workflow tasks. -VOLUME_TEST_DATA = "kubeflow-test-volume" -# Volume mount path to store test data among the Workflow tasks. -MOUNT_PATH = "/mnt/test-data-volume" -# Volume to store GitHub token to clone repos. -VOLUME_GITHUB_TOKEN = "github-token" -# Volume to store AWS secret for the Kaniko build. -VOLUME_AWS_SECRET = "aws-secret" -# Volume to store Docker config for Kaniko build. -VOLUME_DOCKER_CONFIG = "docker-config" - -# Entrypoint for the Argo Workflow. -ENTRYPOINT = "e2e" -# The template that should always run when the Workflow is complete. -EXIT_HANDLER = "exit-handler" - -# Dict with all Katib images. -# Key - image name, Value - dockerfile location. -KATIB_IMAGES = { - "katib-controller": "cmd/katib-controller/v1beta1/Dockerfile", - "katib-db-manager": "cmd/db-manager/v1beta1/Dockerfile", - "katib-ui": "cmd/ui/v1beta1/Dockerfile", - "cert-generator": "cmd/cert-generator/v1beta1/Dockerfile", - "file-metrics-collector": "cmd/metricscollector/v1beta1/file-metricscollector/Dockerfile", - "tfevent-metrics-collector": "cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile", - "suggestion-hyperopt": "cmd/suggestion/hyperopt/v1beta1/Dockerfile", - "suggestion-skopt": "cmd/suggestion/skopt/v1beta1/Dockerfile", - "suggestion-hyperband": "cmd/suggestion/hyperband/v1beta1/Dockerfile", - "suggestion-goptuna": "cmd/suggestion/goptuna/v1beta1/Dockerfile", - "suggestion-optuna": "cmd/suggestion/optuna/v1beta1/Dockerfile", - "suggestion-pbt": "cmd/suggestion/pbt/v1beta1/Dockerfile", - "suggestion-enas": "cmd/suggestion/nas/enas/v1beta1/Dockerfile", - "suggestion-darts": "cmd/suggestion/nas/darts/v1beta1/Dockerfile", - "earlystopping-medianstop": "cmd/earlystopping/medianstop/v1beta1/Dockerfile", - "trial-mxnet-mnist": "examples/v1beta1/trial-images/mxnet-mnist/Dockerfile", - "trial-pytorch-mnist": "examples/v1beta1/trial-images/pytorch-mnist/Dockerfile", - "trial-tf-mnist-with-summaries": "examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile", - "trial-enas-cnn-cifar10-gpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu", - "trial-enas-cnn-cifar10-cpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu", - "trial-darts-cnn-cifar10": "examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile", - "trial-simple-pbt": "examples/v1beta1/trial-images/simple-pbt/Dockerfile", -} - -# Dict with Katib Experiments to run during the test. -# Key - image name, Value - dockerfile location. -KATIB_EXPERIMENTS = { - "random": "examples/v1beta1/hp-tuning/random.yaml", - "grid": "examples/v1beta1/hp-tuning/grid.yaml", - "bayesianoptimization": "examples/v1beta1/hp-tuning/bayesian-optimization.yaml", - "tpe": "examples/v1beta1/hp-tuning/tpe.yaml", - "multivariate-tpe": "examples/v1beta1/hp-tuning/multivariate-tpe.yaml", - "cmaes": "examples/v1beta1/hp-tuning/cma-es.yaml", - "hyperband": "examples/v1beta1/hp-tuning/hyperband.yaml", - "pbt": "examples/v1beta1/hp-tuning/simple-pbt.yaml", - "enas": "examples/v1beta1/nas/enas-cpu.yaml", - "darts": "examples/v1beta1/nas/darts-cpu.yaml", - "pytorchjob": "examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml", - "tfjob": "examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml", - "file-metricscollector": "examples/v1beta1/metrics-collector/file-metrics-collector.yaml", - "file-metricscollector-with-json-format": "examples/v1beta1/metrics-collector/file-metrics-collector-with-json-format.yaml", - "never-resume": "examples/v1beta1/resume-experiment/never-resume.yaml", - "from-volume-resume": "examples/v1beta1/resume-experiment/from-volume-resume.yaml", - "median-stop": "examples/v1beta1/early-stopping/median-stop.yaml", - "median-stop-with-json-format": "examples/v1beta1/early-stopping/median-stop-with-json-format.yaml", -} -# How many Experiments are running in parallel. -PARALLEL_EXECUTION = 5 - - -class WorkflowBuilder(object): - def __init__(self, workflow_name, workflow_namespace, test_dir, ecr_registry): - """WorkflowBuilder constructor. - - :param workflow_name: Argo Workflow name. - :param workflow_namespace: Argo Workflow namespace. - :param test_dir: Root directory to store all data for a particular test run. - :param ecr_registry: ECR registry to push the test images. - """ - - self.workflow_name = workflow_name - self.workflow_namespace = workflow_namespace - self.test_dir = test_dir - self.katib_dir = test_dir + "/src/github.com/kubeflow/katib" - self.manifest_dir = test_dir + "/src/github.com/kubeflow/manifests" - self.ecr_registry = ecr_registry - - def create_task_template(self, task_name, exec_image, command): - """Creates template for all the Workflow tasks. - - :param task_name: Template name for the task. - :param exec_image: Container image to execute the task. - :param command: List of container commands. - - :return: Created task template. - """ - - # Container environment variables. - # TODO (andreyvelich): Add PYTHONPATH ? - env = [ - { - "name": "AWS_ACCESS_KEY_ID", - "valueFrom": { - "secretKeyRef": { - "name": "aws-credentials", - "key": "AWS_ACCESS_KEY_ID" - } - } - }, - { - "name": "AWS_SECRET_ACCESS_KEY", - "valueFrom": { - "secretKeyRef": { - "name": "aws-credentials", - "key": "AWS_SECRET_ACCESS_KEY" - } - } - }, - { - "name": "AWS_REGION", - "value": "us-west-2" - }, - { - "name": "CLUSTER_NAME", - "value": self.workflow_name - }, - { - "name": "EKS_CLUSTER_VERSION", - "value": "1.19" - }, - { - "name": "ECR_REGISTRY", - "value": self.ecr_registry - }, - { - "name": "GIT_TOKEN", - "valueFrom": { - "secretKeyRef": { - "name": "github-token", - "key": "github_token" - } - } - }, - { - "name": "MANIFESTS_DIR", - "value": self.manifest_dir - }, - { - "name": "EXTRA_REPOS", - "value": "kubeflow/testing@HEAD;kubeflow/manifests@v1.5-branch" - }, - # Set GOPATH to test_dir because Katib repo is located under /src/github.com/kubeflow/katib - { - "name": "GOPATH", - "value": self.test_dir - } - ] - - # Container volume mounts. - volume_mounts = [ - { - "name": VOLUME_TEST_DATA, - "mountPath": MOUNT_PATH - }, - { - "name": VOLUME_GITHUB_TOKEN, - "mountPath": "/secret/github-token" - }, - { - "name": VOLUME_AWS_SECRET, - "mountPath": "/root/.aws/" - }, - { - "name": VOLUME_DOCKER_CONFIG, - "mountPath": "/kaniko/.docker/" - }, - ] - - task_template = { - "name": task_name, - # Each container can be alive for 40 minutes. - "retryStrategy": { - "limit": "3", - "retryPolicy": "Always", - "backoff": { - "duration": "1", - "factor": "2", - "maxDuration": "1m", - }, - }, - "container": { - "command": command, - "image": exec_image, - "workingDir": self.katib_dir, - "env": env, - "volumeMounts": volume_mounts, - } - } - - # Add prow env to the task template. - prow_env_dict = argo_build_util.get_prow_dict() - for k, v in prow_env_dict.items(): - task_template["container"]["env"].append({"name": k, "value": v}) - - return task_template - - def create_init_workflow(self): - """Creates initial structure for the Argo Workflow. - - :return: Initial Argo Workflow. - """ - - # Volumes which are used in Argo Workflow. - volumes = [ - { - "name": VOLUME_TEST_DATA, - "persistentVolumeClaim": { - "claimName": "nfs-external" - }, - }, - { - "name": VOLUME_GITHUB_TOKEN, - "secret": { - "secretName": VOLUME_GITHUB_TOKEN - }, - }, - { - "name": VOLUME_AWS_SECRET, - "secret": { - "secretName": VOLUME_AWS_SECRET - }, - }, - { - "name": VOLUME_DOCKER_CONFIG, - "configMap": { - "name": VOLUME_DOCKER_CONFIG - }, - }, - ] - - workflow = { - "apiVersion": "argoproj.io/v1alpha1", - "kind": "Workflow", - "metadata": { - "name": self.workflow_name, - "namespace": self.workflow_namespace, - }, - "spec": { - "entrypoint": ENTRYPOINT, - "volumes": volumes, - "templates": [ - { - "name": ENTRYPOINT, - "dag": { - "tasks": [] - } - }, - { - "name": EXIT_HANDLER, - "dag": { - "tasks": [] - } - } - ], - "onExit": EXIT_HANDLER - }, - } - - return workflow - - -def create_workflow(name, namespace, **kwargs): - """Main function which returns Argo Workflow. - - :param name: Argo Workflow name. - :param namespace: Argo Workflow namespace. - :param kwargs: Argo Workflow additional arguments. - - :return: Created Argo Workflow. - """ - - test_dir = MOUNT_PATH + "/" + name - ecr_registry = kwargs["registry"] - builder = WorkflowBuilder(name, namespace, test_dir, ecr_registry) - - # Build initial structure for the Workflow. - workflow = builder.create_init_workflow() - - # Delete AWS Cluster in the exit handler step. - delete_cluster = builder.create_task_template( - task_name="delete-cluster", - exec_image=IMAGE_WORKER, - command=[ - "/usr/local/bin/delete-eks-cluster.sh", - ] - ) - argo_build_util.add_task_to_dag(workflow, EXIT_HANDLER, delete_cluster, []) - - # Step 1. Checkout GitHub repositories. - checkout = builder.create_task_template( - task_name="checkout", - exec_image=IMAGE_WORKER, - command=[ - "/usr/local/bin/checkout.sh", - test_dir + "/src/github.com" - ] - ) - argo_build_util.add_task_to_dag(workflow, ENTRYPOINT, checkout, []) - - # Step 2.1 Build all Katib images. - depends = [] - for image, dockerfile in KATIB_IMAGES.items(): - build_image = builder.create_task_template( - task_name="build-"+image, - exec_image=IMAGE_KANIKO, - command=[ - "/kaniko/executor", - "--dockerfile={}/{}".format(builder.katib_dir, dockerfile), - "--context=dir://" + builder.katib_dir, - "--destination={}/katib/v1beta1/{}:$(PULL_PULL_SHA)".format(ecr_registry, image) - ] - ) - argo_build_util.add_task_to_dag(workflow, ENTRYPOINT, build_image, [checkout["name"]]) - depends.append(build_image["name"]) - - # Step 2.2 Create AWS cluster. - create_cluster = builder.create_task_template( - task_name="create-cluster", - exec_image=IMAGE_WORKER, - command=[ - "/usr/local/bin/create-eks-cluster.sh", - ] - ) - argo_build_util.add_task_to_dag(workflow, ENTRYPOINT, create_cluster, [checkout["name"]]) - depends.append(create_cluster["name"]) - - # Step 3. Setup Katib on AWS cluster. - setup_katib = builder.create_task_template( - task_name="setup-katib", - exec_image=IMAGE_WORKER, - command=[ - "test/e2e/v1beta1/scripts/setup-katib.sh" - ] - ) - - # Installing Katib after cluster is created and images are built. - argo_build_util.add_task_to_dag(workflow, ENTRYPOINT, setup_katib, depends) - - # Step 4. Run Katib Experiments. - depends = [setup_katib["name"]] - tmp_depends = [] - for index, (experiment, location) in enumerate(KATIB_EXPERIMENTS.items()): - run_experiment = builder.create_task_template( - task_name="run-e2e-experiment-"+experiment, - exec_image=IMAGE_WORKER, - command=[ - "test/e2e/v1beta1/scripts/run-e2e-experiment.sh", - location - ] - ) - argo_build_util.add_task_to_dag(workflow, ENTRYPOINT, run_experiment, depends) - tmp_depends.append(run_experiment["name"]) - # We run only X number of Experiments at the same time. index starts with 0 - if (index+1) % PARALLEL_EXECUTION == 0: - depends, tmp_depends = tmp_depends, [] - - return workflow diff --git a/test/e2e/v1beta1/scripts/aws/setup-katib.sh b/test/e2e/v1beta1/scripts/aws/setup-katib.sh index 1aee77eebc4..fe6e7061171 100755 --- a/test/e2e/v1beta1/scripts/aws/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/aws/setup-katib.sh @@ -51,7 +51,6 @@ make deploy # Wait until all Katib pods is running. TIMEOUT=120s -kubectl wait --for=condition=complete --timeout=${TIMEOUT} -l katib.kubeflow.org/component=cert-generator -n kubeflow job kubectl wait --for=condition=ready --timeout=${TIMEOUT} -l "katib.kubeflow.org/component in (controller,db-manager,mysql,ui)" -n kubeflow pod echo "All Katib components are running." diff --git a/test/e2e/v1beta1/scripts/gh-actions/build-load.sh b/test/e2e/v1beta1/scripts/gh-actions/build-load.sh index 5cd2c10ff5a..2ce492da79a 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/build-load.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/build-load.sh @@ -138,7 +138,6 @@ if "$DEPLOY_KATIB_UI"; then run "katib-ui" "${CMD_PREFIX}/ui/${VERSION}/Dockerfile" fi -run "cert-generator" "$CMD_PREFIX/cert-generator/$VERSION/Dockerfile" run "file-metrics-collector" "$CMD_PREFIX/metricscollector/$VERSION/file-metricscollector/Dockerfile" run "tfevent-metrics-collector" "$CMD_PREFIX/metricscollector/$VERSION/tfevent-metricscollector/Dockerfile" diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index 61bdb5c3490..97c322b3db9 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -66,9 +66,6 @@ cd ../../../../../ && WITH_DATABASE_TYPE=$WITH_DATABASE_TYPE make deploy && cd - # Wait until all Katib pods is running. TIMEOUT=120s -kubectl wait --for=condition=complete --timeout=${TIMEOUT} -l katib.kubeflow.org/component=cert-generator -n kubeflow job || - (kubectl get pods -n kubeflow && kubectl describe pods -n kubeflow && exit 1) - kubectl wait --for=condition=ready --timeout=${TIMEOUT} -l "katib.kubeflow.org/component in ($WITH_DATABASE_TYPE,controller,db-manager,ui)" -n kubeflow pod || (kubectl get pods -n kubeflow && kubectl describe pods -n kubeflow && exit 1)