Skip to content
This repository has been archived by the owner on Feb 3, 2021. It is now read-only.

Commit

Permalink
Feature: Spark retry docker pull (#672)
Browse files Browse the repository at this point in the history
* retry docker pulls

* change order of pool, job, storage creation to reduce conflicts

* add error message on docker-compose curl failure
  • Loading branch information
jafreck committed Oct 24, 2018
1 parent 9e32b4b commit 18b74e4
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 10 deletions.
9 changes: 5 additions & 4 deletions aztk/client/cluster/helpers/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,9 @@ def create_pool_and_job_and_table(
:param VmImageModel: the type of image to provision for the cluster
:param wait: wait until the cluster is ready
"""
# update storage with the necessary values
# save cluster configuration in storage
core_cluster_operations.get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf)

if cluster_conf.scheduling_target != models.SchedulingTarget.Any:
core_cluster_operations.create_task_table(cluster_conf.cluster_id)

# reuse pool_id as job_id
pool_id = cluster_conf.cluster_id
job_id = cluster_conf.cluster_id
Expand Down Expand Up @@ -71,4 +68,8 @@ def create_pool_and_job_and_table(
# Add job to batch
core_cluster_operations.batch_client.job.add(job)

# create storage task table
if cluster_conf.scheduling_target != models.SchedulingTarget.Any:
core_cluster_operations.create_task_table(cluster_conf.cluster_id)

return helpers.get_cluster(cluster_conf.cluster_id, core_cluster_operations.batch_client)
6 changes: 3 additions & 3 deletions aztk/client/job/helpers/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,6 @@ def submit_job(
core_job_operations.get_cluster_data(job_configuration.id).save_cluster_config(
job_configuration.to_cluster_config())

if job_configuration.scheduling_target != models.SchedulingTarget.Any:
core_job_operations.create_task_table(job_configuration.id)

# get a verified node agent sku
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, core_job_operations.batch_client)
Expand Down Expand Up @@ -84,4 +81,7 @@ def submit_job(

core_job_operations.batch_client.job_schedule.add(setup)

if job_configuration.scheduling_target != models.SchedulingTarget.Any:
core_job_operations.create_task_table(job_configuration.id)

return core_job_operations.batch_client.job_schedule.get(job_schedule_id=job_configuration.id)
14 changes: 11 additions & 3 deletions aztk/node_scripts/setup_host.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,11 @@ install_prerequisites () {

install_docker_compose () {
echo "Installing Docker-Compose"
for i in {1..5}; do
sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose && break || sleep 2;
url=https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m`
for i in {1..5}; do
    # Try the download up to five times; stop as soon as one attempt succeeds.
    if sudo curl -L "$url" -o /usr/local/bin/docker-compose; then
        break
    fi
    # Quadratic back-off (1, 4, 9, 16, 25 seconds). $((...)) is arithmetic
    # expansion; the previous $($i**2) was command substitution and tried to
    # execute "1**2", and `sleep $i**2` passed a non-numeric argument to sleep.
    echo "ERROR: failed to download docker-compose ... retrying in $((i**2)) seconds"
    sleep $((i**2))
done
sudo chmod +x /usr/local/bin/docker-compose
echo "Finished installing Docker-Compose"
Expand All @@ -59,7 +62,12 @@ pull_docker_container () {
docker login $DOCKER_ENDPOINT --username $DOCKER_USERNAME --password $DOCKER_PASSWORD
fi

docker pull $docker_repo_name

for i in {1..5}; do
    # Try the pull up to five times; stop as soon as one attempt succeeds.
    if docker pull $docker_repo_name; then
        break
    fi
    # Quadratic back-off (1, 4, 9, 16, 25 seconds). $((...)) is arithmetic
    # expansion; the previous $($i**2) was command substitution and tried to
    # execute "1**2", and `sleep $i**2` passed a non-numeric argument to sleep.
    echo "ERROR: docker pull $docker_repo_name failed ... retrying after $((i**2)) seconds"
    sleep $((i**2))
done
echo "Finished pulling $docker_repo_name"
}

Expand Down

0 comments on commit 18b74e4

Please sign in to comment.