Skip to content

Commit

Permalink
check logs of pod
Browse files Browse the repository at this point in the history
Signed-off-by: helenxie-bit <helenxiehz@gmail.com>
  • Loading branch information
helenxie-bit committed Sep 24, 2024
1 parent dc684e3 commit a12034c
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 62 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/e2e-test-tune-api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,14 @@ jobs:
# Diagnostic step: lists the pods in the default namespace, then prints the description, the logs of all containers, and the matching events for the pod whose name contains both "tune-example-2" and "master".
- name: Fetch Experiment Pod Logs
if: always() # Run this step even if previous steps fail
run: |
echo "Fetching logs for experiment pod..."
echo "Fetching all the pods in the default namespace..."
kubectl get pods -n default
POD_NAME=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master)
echo "Fetching pod description for experiment pod..."
kubectl describe pod $POD_NAME -n default
echo "Fetching logs for experiment pod..."
kubectl logs $POD_NAME -n default --all-containers
echo "Fetching events for experiment pod..."
kubectl get events -n default | grep "tune-example-2"
# Step 6: Fetch kubelet logs (requires sudo for accessing kubelet logs)
Expand Down
62 changes: 1 addition & 61 deletions test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,64 +19,6 @@
# The default logging config.
logging.basicConfig(level=logging.INFO)


def get_experiment_pods_logs(katib_client: KatibClient, exp_name: str, exp_namespace: str):
    """Log the container logs of every pod whose name contains ``exp_name``.

    Best-effort debugging helper: a failure to read logs is reported through
    ``logging.error`` and never raised. Each container is read independently,
    so one failed read (for example a pod that lacks that container) does not
    prevent the remaining logs from being fetched.

    Args:
        katib_client: Not used by this function; kept so existing callers
            keep working.
        exp_name: Substring matched against the pod names.
        exp_namespace: Namespace whose pods are listed.
    """
    v1 = client.CoreV1Api()
    pods = v1.list_namespaced_pod(namespace=exp_namespace)

    # Containers to read, in order. ``None`` means "do not pass a container
    # argument", matching the final un-named read of the original code.
    containers = (
        "metrics-logger-and-collector",
        "pytorch",
        "storage-initializer",
        None,
    )

    for pod in pods.items:
        pod_name = pod.metadata.name
        # Only pods related to the requested Experiment are of interest.
        if exp_name not in pod_name:
            continue

        logging.info(f"Fetching logs for pod: {pod_name}")
        for container in containers:
            request = {"name": pod_name, "namespace": exp_namespace}
            if container is None:
                success_prefix = "Logs"
                failure_suffix = ""
            else:
                request["container"] = container
                success_prefix = f"Logs of {container}"
                failure_suffix = f" (container: {container})"

            try:
                pod_logs = v1.read_namespaced_pod_log(**request)
            except Exception as e:
                # Deliberately broad: this runs while reporting an E2E failure
                # and must not mask the original error with a new one.
                logging.error(
                    f"Failed to get logs for pod {pod_name}{failure_suffix}: {str(e)}"
                )
                continue

            logging.info(f"{success_prefix} for pod {pod_name}:\n{pod_logs}")

def get_experiment_pods_logs_2(katib_client: KatibClient, exp_name: str, exp_namespace: str):
    """Log the pod logs of every pod whose name contains ``exp_name``.

    Unlike a per-container read, no container name is passed to the log API.
    Best-effort debugging helper: a failure to read logs is reported through
    ``logging.error`` and never raised.

    Args:
        katib_client: Not used by this function; kept so existing callers
            keep working.
        exp_name: Substring matched against the pod names.
        exp_namespace: Namespace whose pods are listed.
    """
    v1 = client.CoreV1Api()
    pods = v1.list_namespaced_pod(namespace=exp_namespace)

    for pod in pods.items:
        pod_name = pod.metadata.name
        # Only pods whose name matches the requested substring are of interest.
        if exp_name not in pod_name:
            continue

        logging.info(f"Fetching logs for pod: {pod_name}")
        try:
            # No container is specified for this read.
            pod_logs = v1.read_namespaced_pod_log(
                name=pod_name,
                namespace=exp_namespace,
            )
        except Exception as e:
            # Deliberately broad: this runs while reporting an E2E failure
            # and must not mask the original error with a new one.
            logging.error(f"Failed to get logs for pod {pod_name}: {str(e)}")
            continue

        # The message no longer claims a specific container, since none was
        # requested above.
        logging.info(f"Logs for pod {pod_name}:\n{pod_logs}")


# Test for Experiment created with custom objective.
def run_e2e_experiment_create_by_tune_with_custom_objective(
katib_client: KatibClient,
Expand Down Expand Up @@ -180,6 +122,7 @@ def run_e2e_experiment_create_by_tune_with_external_model(
"size": "10Gi",
"access_modes": ["ReadWriteOnce"],
},
retain_trials=True,
)
experiment = katib_client.wait_for_experiment_condition(
exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT
Expand Down Expand Up @@ -236,9 +179,6 @@ def run_e2e_experiment_create_by_tune_with_external_model(
except Exception as e:
logging.info("---------------------------------------------------------------")
logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}-2")
get_experiment_pods_logs(katib_client, f"{exp_name}-2", exp_namespace)
get_experiment_pods_logs_2(katib_client, "katib-controller", "kubeflow")
get_experiment_pods_logs_2(katib_client, "training-operator", "kubeflow")
raise e
finally:
# Delete the Experiment.
Expand Down

0 comments on commit a12034c

Please sign in to comment.