[batch] Add Job Groups to Batch #14282
Changes from all commits
```diff
@@ -94,39 +94,44 @@ async def cancel_cancelled_ready_jobs_loop_body(self):
             }

         async def user_cancelled_ready_jobs(user, remaining) -> AsyncIterator[Dict[str, Any]]:
-            async for batch in self.db.select_and_fetchall(
+            async for job_group in self.db.select_and_fetchall(
                 """
-SELECT batches.id, job_groups_cancelled.id IS NOT NULL AS cancelled
-FROM batches
-LEFT JOIN job_groups_cancelled
-       ON batches.id = job_groups_cancelled.id
+SELECT job_groups.batch_id, job_groups.job_group_id, t.cancelled IS NOT NULL AS cancelled
+FROM job_groups
+LEFT JOIN LATERAL (
+    SELECT 1 AS cancelled
+    FROM job_group_self_and_ancestors
+    INNER JOIN job_groups_cancelled
+        ON job_group_self_and_ancestors.batch_id = job_groups_cancelled.id AND
+           job_group_self_and_ancestors.ancestor_id = job_groups_cancelled.job_group_id
+    WHERE job_groups.batch_id = job_group_self_and_ancestors.batch_id AND
+          job_groups.job_group_id = job_group_self_and_ancestors.job_group_id
+) AS t ON TRUE
```
**Review comment:** In `estimated-current.sql` there is this comment:

```sql
# inserting all cancelled job groups is not performant with many children job groups
INSERT INTO job_groups_cancelled (id, job_group_id)
VALUES (in_batch_id, in_job_group_id);
```

But at the top of `cancel_job_group`, and also here in this query, we aggregate over all the ancestors. Suppose we had a depth of five: that will insert at most 32 records into `job_groups_cancelled`. That's a pretty small number of rows; MySQL should be able to do that just fine. In the current system, it seems to me we need to aggregate over all of our ancestors anywhere we want to know whether our job group is cancelled, right? For example, in […]

**Reply:** You've got it backwards. Traversing the tree upwards is cheap: if we make the max depth 5, then that's 5 records. Traversing down the tree, however, is slow. Say we're trying to cancel the root job group of a massive tree with 100K direct children, each of which has additional children. Then an INSERT statement that inserts all of those records at once is going to be extremely slow. It's the same reason we don't just update all of the jobs records at once when we cancel, and instead have these checks on whether the job group has been cancelled. With regard to all of the checks about ancestors being cancelled: yes, I tried to put them in all the correct places. Please let me know if you disagree with this reasoning, as its implications change a lot of your other comments and code in other places.

**Reply:** You're right; I was thinking job groups were binary trees, but there's an arbitrary number of children. I think if someone creates 100,000 job groups, that's going to create other problems in our system. We make assumptions that the number of batches is relatively small. For example, in `user_runnable_jobs` there's no limit on the number of job groups we fetch.

The options seem to be: […]

Since we've decided to limit the depth but not limit the width, it seems to me the right choice is 2. We should consider options for avoiding the mass duplication of this lateral-join code, though. It'd be great not to have to re-understand it every time I see it.

**Reply:** We might be able to do this with a SQL function. Then it would just be […]

**Reply:** I tried replacing all of these lateral joins with a stored function, and it mostly worked except there was some sort of deadlock in the […]

**Reply:** Sounds good, let's create an issue and link it in a comment here.
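The upward-traversal check debated above can be reproduced outside MySQL. The sketch below is a toy sqlite3 model, not Hail's code: SQLite has no `LATERAL`, so an `EXISTS` subquery plays the role of the lateral join, and the three-group tree is invented data. Table and column names follow the query in the diff.

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
CREATE TABLE job_groups (batch_id INT, job_group_id INT);
CREATE TABLE job_group_self_and_ancestors (batch_id INT, job_group_id INT, ancestor_id INT);
CREATE TABLE job_groups_cancelled (id INT, job_group_id INT);  -- id is the batch id

-- batch 1: root group 0, child group 1, grandchild group 2
INSERT INTO job_groups VALUES (1, 0), (1, 1), (1, 2);
INSERT INTO job_group_self_and_ancestors VALUES
  (1, 0, 0),
  (1, 1, 1), (1, 1, 0),
  (1, 2, 2), (1, 2, 1), (1, 2, 0);

-- cancelling group 1 writes ONE row; its descendants are found by walking up
INSERT INTO job_groups_cancelled VALUES (1, 1);
""")

# A group is cancelled iff any of its self-and-ancestors rows matches a
# job_groups_cancelled record -- the EXISTS mirrors the LATERAL join above.
rows = conn.execute("""
SELECT jg.batch_id, jg.job_group_id,
       EXISTS (
         SELECT 1
         FROM job_group_self_and_ancestors a
         JOIN job_groups_cancelled c
           ON a.batch_id = c.id AND a.ancestor_id = c.job_group_id
         WHERE a.batch_id = jg.batch_id AND a.job_group_id = jg.job_group_id
       ) AS cancelled
FROM job_groups jg
ORDER BY jg.job_group_id
""").fetchall()

print(rows)  # root group 0 is not cancelled; groups 1 and 2 are
```

This illustrates the trade-off from the thread: cancellation writes a single row, and every reader pays a small bounded-depth lookup instead.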
```diff
 WHERE user = %s AND `state` = 'running';
 """,
                 (user,),
             ):
-                if batch['cancelled']:
+                if job_group['cancelled']:
```
**Review comment:** Continuing https://github.com/hail-is/hail/pull/14170/files#r1476847018 — OK, let's not change anything in this PR so as to keep it as simple and direct as possible. This is my concrete critique: the two SQL queries are nearly the same, but they are long enough to […]

```sql
SELECT jobs.batch_id, jobs.job_id
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND job_group_id = %s AND state = 'Ready' AND always_run = 0
LIMIT %s;
```

```sql
SELECT jobs.batch_id, jobs.job_id
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND job_group_id = %s AND state = 'Ready' AND always_run = 0 AND cancelled = 1
LIMIT %s;
```

They only differ in the cancelled condition. As a reader, I'd prefer code that revealed that:

```python
if job_group['cancelled']:
    where_job_needs_cancelling = ''  # every job in a cancelled group needs cancelling
else:
    where_job_needs_cancelling = 'AND jobs.cancelled'  # jobs.cancelled means child of a failed job
query_for_jobs_to_be_cancelled = f"""
SELECT jobs.batch_id, jobs.job_id
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s
  AND job_group_id = %s
  AND state = 'Ready'
  AND NOT always_run
  {where_job_needs_cancelling}
LIMIT %s;
"""
async for record in self.db.select_and_fetchall(
    query_for_jobs_to_be_cancelled,
    (job_group['batch_id'], job_group['job_group_id'], remaining.value),
):
    yield record
```

**Reply:** Ok. I added this to the list of issues to create.
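The suggested refactor can be distilled into a standalone, testable sketch. This is not Hail's code: `query_for_jobs_to_be_cancelled` as a function is hypothetical, and the `FORCE INDEX` hint is dropped to keep it generic; only the shape of the pattern (one template, one named varying predicate) is the point.

```python
def query_for_jobs_to_be_cancelled(job_group_is_cancelled: bool) -> str:
    """Build the ready-jobs cancellation query, varying only the cancelled predicate."""
    if job_group_is_cancelled:
        where_job_needs_cancelling = ''  # every job in a cancelled group needs cancelling
    else:
        where_job_needs_cancelling = 'AND jobs.cancelled'  # cancelled = child of a failed job
    return f"""
SELECT jobs.batch_id, jobs.job_id
FROM jobs
WHERE batch_id = %s
  AND job_group_id = %s
  AND state = 'Ready'
  AND NOT always_run
  {where_job_needs_cancelling}
LIMIT %s;
"""

# The two variants differ only in the one predicate, which the reader can
# now see at a glance instead of diffing two long string literals.
q_cancelled_group = query_for_jobs_to_be_cancelled(True)
q_running_group = query_for_jobs_to_be_cancelled(False)
assert 'AND jobs.cancelled' not in q_cancelled_group
assert 'AND jobs.cancelled' in q_running_group
```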
```diff
                     async for record in self.db.select_and_fetchall(
                         """
-SELECT jobs.job_id
+SELECT jobs.batch_id, jobs.job_id, jobs.job_group_id
 FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
-WHERE batch_id = %s AND state = 'Ready' AND always_run = 0
+WHERE batch_id = %s AND job_group_id = %s AND state = 'Ready' AND always_run = 0
 LIMIT %s;
 """,
-                        (batch['id'], remaining.value),
+                        (job_group['batch_id'], job_group['job_group_id'], remaining.value),
                     ):
-                        record['batch_id'] = batch['id']
```
**Review comment:** I'm somewhat indifferent on this change. More data is transmitted (potentially a lot more, because jobs >> groups), but it's a bit more direct and clear as now written. However: why include it in this PR? Is something changing about how batch_ids work that required this change?

**Reply:** No, I was trying to clean up everything to avoid more comments about unreadability and confusion from unneeded lines of code. I feel like I am trying to read your mind on minimizing code changes while, at the same time, a lot of comments are about cleaning up the code.

**Reply:** The way I think of it: a PR presents a set of changes. The reviewer reads the PR to understand the proposed change. For that purpose, we want the PR as slim as possible because it aids understanding. The resulting conversation about the change may reveal related simplifications that the author and reviewer decide, as a pair, either to include in the current PR or to defer to follow-up PRs. In general, if a comment I make seems unrelated, I'm 100% OK with it becoming a separate PR. If an author notices poor code while preparing a PR, those changes should be made in independent PRs or, if the PR depends on them, should be made first. For example, the sync tool PR (#14248) spawned a few PRs (#14139, #14162, #14176, #14181) that were independent but related (some of which needed to merge before the sync tool could be reviewed).

I recognize this process does take some time, but it ensures the reviewers can fully grok each change and incorporate it into their mental model of the system.
```diff
                         yield record
                 else:
```
```diff
                     async for record in self.db.select_and_fetchall(
                         """
-SELECT jobs.job_id
+SELECT jobs.batch_id, jobs.job_id, jobs.job_group_id
 FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
-WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND cancelled = 1
+WHERE batch_id = %s AND job_group_id = %s AND state = 'Ready' AND always_run = 0 AND cancelled = 1
 LIMIT %s;
 """,
-                        (batch['id'], remaining.value),
+                        (job_group['batch_id'], job_group['job_group_id'], remaining.value),
                     ):
-                        record['batch_id'] = batch['id']
```
**Review comment:** Same comment here about the addition of batch_id. I'm ambivalent, but not sure why it's here: did something change that I'm missing?

**Reply:** See the response to #14282 (comment).
```diff
                         yield record

             waitable_pool = WaitableSharedPool(self.async_worker_pool)
```
```diff
@@ -137,18 +142,30 @@ async def user_cancelled_ready_jobs(user, remaining) -> AsyncIterator[Dict[str,
             async for record in user_cancelled_ready_jobs(user, remaining):
                 batch_id = record['batch_id']
                 job_id = record['job_id']
+                job_group_id = record['job_group_id']
                 id = (batch_id, job_id)
                 log.info(f'cancelling job {id}')

-                async def cancel_with_error_handling(app, batch_id, job_id, id):
+                async def cancel_with_error_handling(app, batch_id, job_id, job_group_id, id):
                     try:
                         await mark_job_complete(
-                            app, batch_id, job_id, None, None, 'Cancelled', None, None, None, 'cancelled', []
+                            app,
+                            batch_id,
+                            job_id,
+                            None,
+                            job_group_id,
+                            None,
+                            'Cancelled',
+                            None,
+                            None,
+                            None,
+                            'cancelled',
+                            [],
                         )
                     except Exception:
                         log.info(f'error while cancelling job {id}', exc_info=True)

-                await waitable_pool.call(cancel_with_error_handling, self.app, batch_id, job_id, id)
+                await waitable_pool.call(cancel_with_error_handling, self.app, batch_id, job_id, job_group_id, id)

                 remaining.value -= 1
                 if remaining.value <= 0:
```
```diff
@@ -182,28 +199,34 @@ async def cancel_cancelled_creating_jobs_loop_body(self):
             }

         async def user_cancelled_creating_jobs(user, remaining) -> AsyncIterator[Dict[str, Any]]:
-            async for batch in self.db.select_and_fetchall(
+            async for job_group in self.db.select_and_fetchall(
                 """
-SELECT batches.id
-FROM batches
-INNER JOIN job_groups_cancelled
-       ON batches.id = job_groups_cancelled.id
+SELECT job_groups.batch_id, job_groups.job_group_id
+FROM job_groups
+INNER JOIN LATERAL (
+    SELECT 1 AS cancelled
+    FROM job_group_self_and_ancestors
+    INNER JOIN job_groups_cancelled
+        ON job_group_self_and_ancestors.batch_id = job_groups_cancelled.id AND
+           job_group_self_and_ancestors.ancestor_id = job_groups_cancelled.job_group_id
+    WHERE job_groups.batch_id = job_group_self_and_ancestors.batch_id AND
+          job_groups.job_group_id = job_group_self_and_ancestors.job_group_id
+) AS t ON TRUE
```
**Review comment:** Same comment here as above: we're walking up the group tree every time we run the cancel query. It seems better to update all those children once and then benefit from it on all these periodic queries.

**Reply:** See #14282 (comment).
```diff
 WHERE user = %s AND `state` = 'running';
 """,
                 (user,),
             ):
                 async for record in self.db.select_and_fetchall(
                     """
-SELECT jobs.job_id, attempts.attempt_id, attempts.instance_name
+SELECT jobs.batch_id, jobs.job_id, attempts.attempt_id, attempts.instance_name, jobs.job_group_id
 FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
 STRAIGHT_JOIN attempts
        ON attempts.batch_id = jobs.batch_id AND attempts.job_id = jobs.job_id
-WHERE jobs.batch_id = %s AND state = 'Creating' AND always_run = 0 AND cancelled = 0
+WHERE jobs.batch_id = %s AND jobs.job_group_id = %s AND state = 'Creating' AND always_run = 0 AND cancelled = 0
 LIMIT %s;
 """,
-                    (batch['id'], remaining.value),
+                    (job_group['batch_id'], job_group['job_group_id'], remaining.value),
                 ):
-                    record['batch_id'] = batch['id']
```
**Review comment:** Same comment about batch_id. Did something change that I'm missing, or is this an unrelated cosmetic change?

**Reply:** See the response to #14282 (comment).
```diff
                     yield record

             waitable_pool = WaitableSharedPool(self.async_worker_pool)
```
```diff
@@ -215,17 +238,21 @@ async def user_cancelled_creating_jobs(user, remaining) -> AsyncIterator[Dict[st
                 batch_id = record['batch_id']
                 job_id = record['job_id']
                 attempt_id = record['attempt_id']
+                job_group_id = record['job_group_id']
                 instance_name = record['instance_name']
                 id = (batch_id, job_id)

-                async def cancel_with_error_handling(app, batch_id, job_id, attempt_id, instance_name, id):
+                async def cancel_with_error_handling(
+                    app, batch_id, job_id, attempt_id, job_group_id, instance_name, id
+                ):
                     try:
                         end_time = time_msecs()
                         await mark_job_complete(
                             app,
                             batch_id,
                             job_id,
                             attempt_id,
+                            job_group_id,
                             instance_name,
                             'Cancelled',
                             None,

@@ -246,7 +273,7 @@ async def cancel_with_error_handling(app, batch_id, job_id, attempt_id, instance
                         log.info(f'cancelling creating job {id} on instance {instance_name}', exc_info=True)

                 await waitable_pool.call(
-                    cancel_with_error_handling, self.app, batch_id, job_id, attempt_id, instance_name, id
+                    cancel_with_error_handling, self.app, batch_id, job_id, attempt_id, job_group_id, instance_name, id
                 )

                 remaining.value -= 1
```
```diff
@@ -279,28 +306,34 @@ async def cancel_cancelled_running_jobs_loop_body(self):
             }

         async def user_cancelled_running_jobs(user, remaining) -> AsyncIterator[Dict[str, Any]]:
-            async for batch in self.db.select_and_fetchall(
+            async for job_group in self.db.select_and_fetchall(
                 """
-SELECT batches.id
-FROM batches
-INNER JOIN job_groups_cancelled
-       ON batches.id = job_groups_cancelled.id
+SELECT job_groups.batch_id, job_groups.job_group_id
+FROM job_groups
+INNER JOIN LATERAL (
+    SELECT 1 AS cancelled
+    FROM job_group_self_and_ancestors
+    INNER JOIN job_groups_cancelled
+        ON job_group_self_and_ancestors.batch_id = job_groups_cancelled.id AND
+           job_group_self_and_ancestors.ancestor_id = job_groups_cancelled.job_group_id
+    WHERE job_groups.batch_id = job_group_self_and_ancestors.batch_id AND
+          job_groups.job_group_id = job_group_self_and_ancestors.job_group_id
+) AS t ON TRUE
```
**Review comment:** Same comment about the cost of walking up the tree on every query.

**Reply:** See #14282 (comment).
```diff
 WHERE user = %s AND `state` = 'running';
 """,
                 (user,),
             ):
                 async for record in self.db.select_and_fetchall(
                     """
-SELECT jobs.job_id, attempts.attempt_id, attempts.instance_name
+SELECT jobs.batch_id, jobs.job_id, attempts.attempt_id, attempts.instance_name
 FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
 STRAIGHT_JOIN attempts
        ON attempts.batch_id = jobs.batch_id AND attempts.job_id = jobs.job_id
-WHERE jobs.batch_id = %s AND state = 'Running' AND always_run = 0 AND cancelled = 0
+WHERE jobs.batch_id = %s AND jobs.job_group_id = %s AND state = 'Running' AND always_run = 0 AND cancelled = 0
 LIMIT %s;
 """,
-                    (batch['id'], remaining.value),
+                    (job_group['batch_id'], job_group['job_group_id'], remaining.value),
                 ):
-                    record['batch_id'] = batch['id']
```
**Review comment:** Same comment about the batch_id column.

**Reply:** See the response to #14282 (comment).
```diff
                     yield record

             waitable_pool = WaitableSharedPool(self.async_worker_pool)
```
**Review comment:** When is `n_cancelled > 0` but `cancelled` is False?

**Reply:** This happens when there are jobs that have been cancelled due to failed dependencies (`jobs.cancelled = 1`) or the batch / job group was cancelled.

**Reply:** If a job is cancelled because a parent failed, wouldn't the first `if` branch fire? When is `record['n_failed']` True, `record['cancelled']` False, but `record['n_cancelled'] > 0`?

**Reply:** I think that is the intention. This code is the same as the Batch response, which has been there forever. I think this guards against child dependencies being cancelled causing the state to be cancelled, while allowing for cancelled empty batches and batches that have been cancelled with jobs.
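The guard being described can be made concrete. The sketch below is not the actual Batch code, only one plausible reading of the branching discussed in this thread: the function name, the `'success'`/`'running'` fallbacks, and the `n_completed`/`n_jobs` fields are assumptions; the point is that the `n_failed` branch fires first, so jobs cancelled because a dependency failed never flip the state to `cancelled` on their own, while a cancelled empty group still reads as cancelled.

```python
def job_group_state(record):
    # Hypothetical state resolution. A failure anywhere wins; that is what
    # keeps failed-dependency cancellations (n_cancelled > 0, cancelled False)
    # from surfacing as 'cancelled' -- they always arrive with n_failed > 0.
    if record['n_failed'] > 0:
        return 'failure'
    # Group-level cancellation, including a cancelled batch with zero jobs.
    if record['cancelled'] or record['n_cancelled'] > 0:
        return 'cancelled'
    if record['n_completed'] == record['n_jobs']:
        return 'success'
    return 'running'

# A child cancelled by its failed parent: the failure branch fires first.
print(job_group_state({'n_failed': 1, 'cancelled': False, 'n_cancelled': 3,
                       'n_completed': 4, 'n_jobs': 4}))  # failure
# A cancelled empty batch still reads as cancelled.
print(job_group_state({'n_failed': 0, 'cancelled': True, 'n_cancelled': 0,
                       'n_completed': 0, 'n_jobs': 0}))  # cancelled
```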