Skip to content

Commit

Permalink
[PLAT-14590] Fix permission check for otel-collector file existence check
Browse files Browse the repository at this point in the history

Summary: Fix an issue where an on-prem node gets stuck in the `Decommissioned` state because the otel-collector service is stopped/removed incorrectly during node cleanup. We assume that otel-collector runs only as a user-level systemd service.

Test Plan:
Create a 6-node RF3 on-prem universe. Enable audit log export. Validate that `yb-server-ctl.sh` is updated with the new changes that add otel-collector support.

Perform a replace, or a remove followed by a release, action. Make sure that the node instance removed from the universe moves from the `USED` state to the `FREE` state.

Reviewers: sanketh, nsingh, yshchetinin

Reviewed By: nsingh

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D37276
  • Loading branch information
charleswang234 committed Sep 13, 2024
1 parent 3d59575 commit ec951f8
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 53 deletions.
19 changes: 9 additions & 10 deletions managed/devops/opscli/ybops/cloud/onprem/method.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,14 @@ def callback(self, args):

# First stop both tserver and master processes.
processes = ["tserver", "master", "controller"]
logging.info(("[app] Running control script to stop " +
"against master, tserver and controller at {}").format(host_info['name']))
self.cloud.run_control_script(processes[0], "stop-destroy", args,
self.extra_vars, host_info)
self.cloud.run_control_script(processes[1], "stop-destroy", args,
self.extra_vars, host_info)
self.cloud.run_control_script(processes[2], "stop-destroy", args,
self.extra_vars, host_info)
if args.clean_otel_collector and args.provisioning_cleanup:
processes.append("otel-collector")

for process in processes:
logging.info(("[app] Running control script to stop {} at {}")
.format(process, host_info['name']))
self.cloud.run_control_script(process, "stop-destroy", args,
self.extra_vars, host_info)

# Revert the force using of user yugabyte.
args.ssh_user = ssh_user
Expand All @@ -186,8 +186,7 @@ def callback(self, args):
"platform-services", "remove-services", args, self.extra_vars, host_info)

# Run non-db related tasks.
if ((args.clean_node_exporter or args.clean_otel_collector)
and args.provisioning_cleanup):
if args.clean_node_exporter and args.provisioning_cleanup:
logging.info(("[app] Running control script remove-services " +
"against thirdparty services at {}").format(host_info['name']))
self.cloud.run_control_script(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Daemons:
master
tserver
controller
otel-collector
Commands:
create - Start the YB process on this node in cluster creation node (only applicable for
master)
Expand Down Expand Up @@ -144,6 +145,9 @@ clean_data_paths() {
if [ "$daemon" == "controller" ]; then
rm -rf "${MOUNT_PATHS[i]}"/ybc-data
fi
if [ "$daemon" == "otel-collector" ]; then
rm -rf "${MOUNT_PATHS[i]}"/otel-collector
fi
done

print_err_out "Cleaning core files on `hostname`"
Expand Down Expand Up @@ -259,6 +263,16 @@ case "$daemon" in
exit 1
fi
;;
otel-collector)
  # Reject the commands this script does not support for otel-collector.
  if [ "$command" == "create" ]; then
    echo "create command is not valid for otel-collector"
    exit 1
  fi
  # Each comparison needs its own `[ ... ]`. `||` is a shell list operator, not
  # a `[` operator: placed inside a single pair of brackets it splits the test
  # in two, the first half fails for lack of a closing `]`, and the guard can
  # never match.
  if [ "$command" == "start" ] || [ "$command" == "stop" ]; then
    echo "stop and start command is not valid for otel-collector as is systemd only"
    exit 1
  fi
  ;;
*)
echo "Invalid Daemon: $daemon"
print_help
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,16 @@
state: link
force: yes

# Render the yb-server-ctl.sh.j2 template to {{ yb_home_dir }}/bin/yb-server-ctl.sh.
# mount_paths and yb_cores_dir are task-level vars made available to the template.
- name: Configure | Update yb server ctl script
  vars:
    mount_paths: "{{ _mount_points | join(' ') }}"
    yb_cores_dir: "{{ yb_home_dir }}/cores"
  template:
    src: "roles/configure-cluster-server/templates/yb-server-ctl.sh.j2"
    dest: "{{ yb_home_dir }}/bin/yb-server-ctl.sh"
    owner: "{{ user_name }}"
    # Quoted so the mode is always passed as an octal string and never
    # re-interpreted as a YAML number.
    mode: "0755"

- name: Install OpenTelemetry collector | Check logs cleanup script exists
stat:
path: "{{ yb_home_dir }}/bin/zip_purge_yb_logs.sh"
Expand Down
67 changes: 24 additions & 43 deletions managed/devops/yb-server-ctl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,41 +48,44 @@
- name: Set the systemd prefix
set_fact:
systemd_prefix: "{{ 'systemctl --user' if systemd_user.stat.exists else 'sudo systemctl'}}"
- name: Set systemd name
set_fact:
systemd_process: "{{ process if process == 'otel-collector' else 'yb-' ~ process }}"
- name: Print systemd_prefix
debug:
var: systemd_prefix
- block:
- name: Reload daemon
shell: "{{ systemd_prefix }} daemon-reload"
- name: Enable the {{ process }}
shell: "{{ systemd_prefix }} enable yb-{{ process }}"
- name: Enable the {{ systemd_process }}
shell: "{{ systemd_prefix }} enable {{ systemd_process }}"
when: command == 'start'
- name: Perform {{ command }} on the {{ process }}
shell: "{{ systemd_prefix }} {{ command }} yb-{{ process }}"
- name: Perform {{ command }} on the {{ systemd_process }}
shell: "{{ systemd_prefix }} {{ command }} {{ systemd_process }}"
when: command == 'start' or command == 'stop'
- name: Disable the {{ process }}
shell: "{{ systemd_prefix }} disable yb-{{ process }}"
- name: Disable the {{ systemd_process }}
shell: "{{ systemd_prefix }} disable {{ systemd_process }}"
when: command == 'stop'
- name: Reload daemon
shell: "{{ systemd_prefix }} daemon-reload"
when: command != "stop-destroy"

- name: Stop the {{ process }} process with systemd on destroy
- name: Stop the {{ systemd_process }} process with systemd on destroy
block:
- name: Reload daemon
shell: "{{ systemd_prefix }} daemon-reload"
- name: yb-{{ process }} status output
shell: "{{ systemd_prefix }} status yb-{{ process }}"
- name: "{{ systemd_process }} status output"
shell: "{{ systemd_prefix }} status {{ systemd_process }}"
register: systemd_status_output
ignore_errors: True
- name: Print systemd_status_output.rc
debug:
msg: "{{ systemd_status_output.rc }}"
- name: Perform stop on the {{ process }}
shell: "{{ systemd_prefix }} stop yb-{{ process }}"
- name: Perform stop on the {{ systemd_process }}
shell: "{{ systemd_prefix }} stop {{ systemd_process }}"
when: systemd_status_output.rc != 4
- name: Disable the {{ process }}
shell: "{{ systemd_prefix }} disable yb-{{ process }}"
- name: Disable the {{ systemd_process }}
shell: "{{ systemd_prefix }} disable {{ systemd_process }}"
when: systemd_status_output.rc != 4
- name: Reload daemon
shell: "{{ systemd_prefix }} daemon-reload"
Expand All @@ -107,13 +110,9 @@
- block:
- set_fact:
prometheus_systemd_unit_dir: "/lib/systemd/system"
otel_systemd_dir: "/etc/systemd/system"
- set_fact:
prometheus_systemd_unit_dir: "/usr/lib/systemd/system"
when: ansible_os_family == "Suse"
- set_fact:
otel_systemd_dir: "{{ yb_home_dir }}/.config/systemd/user"
when: ansible_os_family != 'RedHat' or (ansible_distribution_major_version != '7' and not (ansible_distribution == 'Amazon' and ansible_distribution_major_version == '2'))

- block:
- name: Check node exporter service exists
Expand All @@ -140,37 +139,11 @@
become_method: sudo
when: clean_node_exporter is defined and clean_node_exporter|bool

- block:
- name: Check otel collector service exists
stat:
path: "{{ otel_systemd_dir }}/otel-collector.service"
register: otel_collector_stat
- name: Log otel_collector_stat
debug:
var: otel_collector_stat
- name: Stop otel collector service
service:
enabled: yes
name: otel-collector
state: stopped
become: yes
become_method: sudo
when: otel_collector_stat.stat.exists

- name: Delete otel collector service
file:
path: "{{ otel_systemd_dir }}/otel-collector.service"
state: absent
become: yes
become_method: sudo
when: clean_otel_collector is defined and clean_otel_collector|bool

- name: Perform daemon-reload for removed services
shell:
cmd: "systemctl daemon-reload"
become: yes
become_method: sudo

when: process == "thirdparty" and command == "remove-services"

- name: Removing platform services
Expand Down Expand Up @@ -266,6 +239,14 @@
become: yes
become_method: sudo

# Remove the otel-collector unit file from {{ systemd_dir }} using sudo.
# Runs only when clean_otel_collector is defined and evaluates to true.
- name: Delete otel collector service
  become: true
  become_method: sudo
  file:
    state: absent
    path: "{{ systemd_dir }}/otel-collector.service"
  when:
    - clean_otel_collector is defined
    - clean_otel_collector | bool

- name: Perform daemon-reload for removed services
shell:
cmd: "systemctl daemon-reload"
Expand Down

0 comments on commit ec951f8

Please sign in to comment.