Skip to content

Commit

Permalink
Merge pull request #335 from populationgenomics/upstream-14454
Browse files Browse the repository at this point in the history
Merge upstream HEAD(bea04d9, 2024-05-21) [release] 0.2.130 (hail-is#14454)
  • Loading branch information
milo-hyben authored May 24, 2024
2 parents 637f17f + f8a4e39 commit 6f76e36
Show file tree
Hide file tree
Showing 346 changed files with 2,091 additions and 5,817 deletions.
2 changes: 2 additions & 0 deletions .git-blame-ignore-revs
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ da2790242a40ec425a53a02707d261c893b264f7
422edf6386616711ca70f87c455f76781ac925d4
# replaces black formatting with ruff
fa2ef0f2c76654d0c037ff6db60ccb8842fb8539
# ruff lint python imports
01a6a6a107faf204d4f5c20f8ae510d2c35518e9
2 changes: 0 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ check-pip-requirements:
hail/python/dev \
gear \
web_common \
auth \
batch \
ci

Expand All @@ -98,7 +97,6 @@ check-linux-pip-requirements:
hail/python/dev \
gear \
web_common \
auth \
batch \
ci

Expand Down
2 changes: 2 additions & 0 deletions batch/batch/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ def job_record_to_dict(record: Dict[str, Any], name: Optional[str]) -> JobListEn
'cost': coalesce(record.get('cost'), 0),
'msec_mcpu': record['msec_mcpu'],
'cost_breakdown': cost_breakdown,
'always_run': bool(record['always_run']),
'display_state': None,
},
)

Expand Down
15 changes: 11 additions & 4 deletions batch/batch/cloud/gcp/worker/worker_api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import base64
import os
import tempfile
from contextlib import AsyncExitStack
from typing import Dict, List

import orjson
Expand All @@ -20,13 +21,13 @@
class GCPWorkerAPI(CloudWorkerAPI):
nameserver_ip = '169.254.169.254'

# async because GoogleSession must be created inside a running event loop
# async because ClientSession must be created inside a running event loop
@staticmethod
async def from_env() -> 'GCPWorkerAPI':
project = os.environ['PROJECT']
zone = os.environ['ZONE'].rsplit('/', 1)[1]
worker_credentials = aiogoogle.GoogleInstanceMetadataCredentials()
http_session = httpx.ClientSession()
http_session = httpx.client_session()
return GCPWorkerAPI(project, zone, worker_credentials, http_session)

def __init__(
Expand All @@ -38,9 +39,15 @@ def __init__(
):
self.project = project
self.zone = zone

self._exit_stack = AsyncExitStack()
self._http_session = http_session
self._metadata_server_client = aiogoogle.GoogleMetadataServerClient(http_session)
self._exit_stack.push_async_callback(self._http_session.close)

self._compute_client = aiogoogle.GoogleComputeClient(project)
self._exit_stack.push_async_callback(self._compute_client.close)

self._metadata_server_client = aiogoogle.GoogleMetadataServerClient(http_session)
self._gcsfuse_credential_files: Dict[str, str] = {}
self._worker_credentials = worker_credentials

Expand Down Expand Up @@ -132,7 +139,7 @@ async def unmount_cloudfuse(self, mount_base_path_data: str):
del self._gcsfuse_credential_files[mount_base_path_data]

async def close(self):
await self._compute_client.close()
await self._exit_stack.aclose()

def __str__(self):
return f'project={self.project} zone={self.zone}'
5 changes: 5 additions & 0 deletions batch/batch/front_end/front_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -2449,6 +2449,11 @@ async def ui_batch(request, userdata, batch_id):
for j in jobs:
j['duration'] = humanize_timedelta_msecs(j['duration'])
j['cost'] = cost_str(j['cost'])
j['display_state'] = (
f"{j['state']} (always run)"
if j['always_run'] and j['state'] not in {'Success', 'Failed', 'Error'}
else j['state']
)
batch['jobs'] = jobs

batch['cost'] = cost_str(batch['cost'])
Expand Down
2 changes: 1 addition & 1 deletion batch/batch/front_end/templates/batch.html
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ <h2>Jobs</h2>
{{ job['name'] }}
{% endif %}
</td>
<td>{{ job['state'] }}</td>
<td>{{ job['display_state'] }}</td>
<td>
{% if 'exit_code' in job and job['exit_code'] is not none %}
{{ job['exit_code'] }}
Expand Down
14 changes: 7 additions & 7 deletions batch/batch/front_end/templates/billing.html
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,19 @@ <h1>Billing</h1>
</div>
</div>

<h2>Total Cost</h2>
<h2>Total Spend</h2>
<ul>
<li>{{ total_cost }}</li>
</ul>

{% if is_developer %}
<h2>Cost by Billing Project</h2>
<h2>Spend by Billing Project</h2>
<div class='flex-col' style="overflow: auto;">
<table class="data-table" id="billing_by_project">
<thead>
<tr>
<th>Billing Project</th>
<th>Cost</th>
<th>Spend</th>
</tr>
</thead>
<tbody>
Expand All @@ -60,13 +60,13 @@ <h2>Cost by Billing Project</h2>
</table>
</div>

<h2>Cost by User</h2>
<h2>Spend by User</h2>
<div class='flex-col' style="overflow: auto;">
<table class="data-table" id="billing_by_user">
<thead>
<tr>
<th>User</th>
<th>Cost</th>
<th>Spend</th>
</tr>
</thead>
<tbody>
Expand All @@ -81,14 +81,14 @@ <h2>Cost by User</h2>
</div>
{% endif %}

<h2>Cost by Billing Project and User</h2>
<h2>Spend by Billing Project and User</h2>
<div class='flex-col' style="overflow: auto;">
<table class="data-table" id="billing_by_project_user">
<thead>
<tr>
<th>Billing Project</th>
<th>User</th>
<th>Cost</th>
<th>Spend</th>
</tr>
</thead>
<tbody>
Expand Down
1 change: 1 addition & 0 deletions batch/batch/front_end/templates/job.html
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ <h2>Properties</h2>
<li>Exit Code: {% if 'exit_code' in job and job['exit_code'] is not none %}{{ job['exit_code'] }}{% endif %}</li>
<li>Duration: {% if 'duration' in job and job['duration'] is not none %}{{ job['duration'] }}{% endif %}</li>
<li>Cost: {% if 'cost' in job and job['cost'] is not none %}{{ job['cost'] }}{% endif %}</li>
<li>Always Run: {% if 'always_run' in job and job['always_run'] is not none %}{{ job['always_run'] }}{% endif %}</li>
</ul>

<h2>Attributes</h2>
Expand Down
3 changes: 1 addition & 2 deletions batch/batch/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,7 @@ async def query_billing_projects_with_cost(db, user=None, billing_project=None)
) AS usage_t
LEFT JOIN resources ON resources.resource_id = usage_t.resource_id
) AS cost_t ON TRUE
{where_condition}
LOCK IN SHARE MODE;
{where_condition};
"""

billing_projects = []
Expand Down
6 changes: 3 additions & 3 deletions batch/batch/worker/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2936,7 +2936,7 @@ def __repr__(self):


class Worker:
def __init__(self, client_session: httpx.ClientSession):
def __init__(self):
self.active = False
self.cores_mcpu = CORES * 1000
self.last_updated = time_msecs()
Expand All @@ -2948,7 +2948,7 @@ def __init__(self, client_session: httpx.ClientSession):
self.task_manager = aiotools.BackgroundTaskManager()
os.makedirs('/hail-jars/', exist_ok=True)
self.jar_download_locks: Dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
self.client_session = client_session
self.client_session = httpx.client_session()

self.image_data: Dict[str, ImageData] = defaultdict(ImageData)
self.image_data[BATCH_WORKER_IMAGE_ID] += 1
Expand Down Expand Up @@ -3452,7 +3452,7 @@ async def async_main():
network_allocator = NetworkAllocator(network_allocator_task_manager)
await network_allocator.reserve()

worker = Worker(httpx.client_session())
worker = Worker()
try:
async with AsyncExitStack() as cleanup:
cleanup.push_async_callback(docker.close)
Expand Down
18 changes: 12 additions & 6 deletions batch/test/test_batch.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import collections
import os
import secrets
Expand Down Expand Up @@ -1497,18 +1498,23 @@ def test_pool_standard_instance_cheapest(client: BatchClient):

# Transitively is not valid for terra
@skip_in_azure
@pytest.mark.timeout(10 * 60)
def test_gpu_accesibility_g2(client: BatchClient):
b = create_batch(client)
async def test_gpu_accesibility_g2(client: BatchClient):
b = create_batch(client)._async_batch
resources = {'machine_type': "g2-standard-4", 'storage': '100Gi'}
j = b.create_job(
os.environ['HAIL_GPU_IMAGE'],
['python3', '-c', 'import torch; assert torch.cuda.is_available()'],
resources=resources,
)
b.submit()
status = j.wait()
assert status['state'] == 'Success', str((status, b.debug_info()))
await b.submit()
try:
status = await asyncio.wait_for(j.wait(), timeout=5 * 60)
assert status['state'] == 'Success', str((status, b.debug_info()))
except asyncio.TimeoutError:
# G2 instances are not always available within a time window
# acceptable for CI. This test is permitted to time out
# but not otherwise fail
pass


def test_job_private_instance_preemptible(client: BatchClient):
Expand Down
10 changes: 4 additions & 6 deletions benchmark/Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
include ../config.mk
include ../hail/version.mk

SHORT_REVISION := $(shell git rev-parse --short=12 HEAD)

HAIL_PYTHON3 ?= python3
PIP := $(HAIL_PYTHON3) -m pip

Expand Down Expand Up @@ -53,21 +51,21 @@ pushed_image: image

BENCHMARK_ITERS ?= 3
BENCHMARK_REPLICATES ?= 5
HAIL_WHEEL_DESCRIPTOR ?= $(HAIL_PIP_VERSION)-$(SHORT_REVISION)
BENCHMARK_BUCKET ?= gs://hail-benchmarks-2
.PHONY: submit
submit: pushed_image install
@echo Using pushed image `cat pushed_image`
$(HAIL_PYTHON3) scripts/benchmark_in_batch.py \
`cat pushed_image` \
$(BENCHMARK_BUCKET)/$(shell whoami) \
$(HAIL_WHEEL_DESCRIPTOR) \
$(HAIL_VERSION) \
$(BENCHMARK_REPLICATES) \
$(BENCHMARK_ITERS)
$(BENCHMARK_ITERS) \
run

clean: cleanup_image
rm -rf python/dist/*
rm -rf python/build/*
rm -r $(HAIL_BENCHMARK_VERSION_FILE)
rm -f $(HAIL_BENCHMARK_VERSION_FILE)

FORCE:
Loading

0 comments on commit 6f76e36

Please sign in to comment.