-
Notifications
You must be signed in to change notification settings - Fork 248
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[batch] Add ability to create job groups at top level #14170
Changes from 39 commits
80b664a
0b50b03
6c9c776
284b457
d1fd11a
8ac5425
5ffa578
03cdaa5
904a045
3bdca11
e7fe638
e09c512
b40bff2
1e20595
ed95628
e6ed1f0
853d949
4c2b750
d7d3b53
1dc4ce9
b777802
295c339
166928c
322b01d
f1697c2
0d97818
0f2cc55
5fbd6e8
9b17076
239bd86
9889031
937d501
d32e968
328e7a6
26fe167
0b1b66b
9555687
8a468ba
8aa3bb8
f3b6e4c
c3b825f
36af4f8
7bb3f2b
0802a8e
d631d70
3597a89
1030f59
229d8b6
fc781a6
ef6163c
9fd31c3
4569b3d
4219370
9a9610f
1336950
f66f615
56f6c77
3ffdfae
823da60
df9ebcd
f76070d
ae1b484
51242a0
b138287
c63f961
490cff2
e67594f
e50ab12
2fdfcfc
aacfddb
16187ca
964dcfa
244354a
c4028c1
946ef12
1870039
de473d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -179,12 +179,13 @@ async def schedule_jobs_loop_body(self): | |
async for record in self.db.select_and_fetchall( | ||
""" | ||
SELECT jobs.*, batches.format_version, batches.userdata, batches.user, attempts.instance_name, time_ready | ||
FROM batches | ||
INNER JOIN jobs ON batches.id = jobs.batch_id | ||
FROM job_groups | ||
LEFT JOIN batches ON batches.id = job_groups.batch_id | ||
LEFT JOIN jobs ON job_groups.batch_id = jobs.batch_id AND job_groups.job_group_id = jobs.job_group_id | ||
LEFT JOIN jobs_telemetry ON jobs.batch_id = jobs_telemetry.batch_id AND jobs.job_id = jobs_telemetry.job_id | ||
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id | ||
LEFT JOIN instances ON attempts.instance_name = instances.name | ||
WHERE batches.state = 'running' | ||
WHERE job_groups.state = 'running' | ||
AND jobs.state = 'Creating' | ||
AND (jobs.always_run OR NOT jobs.cancelled) | ||
AND jobs.inst_coll = %s | ||
|
@@ -349,54 +350,55 @@ async def create_instances_loop_body(self): | |
} | ||
|
||
async def user_runnable_jobs(user, remaining) -> AsyncIterator[Dict[str, Any]]: | ||
async for batch in self.db.select_and_fetchall( | ||
async for job_group in self.db.select_and_fetchall( | ||
""" | ||
SELECT batches.id, job_groups_cancelled.id IS NOT NULL AS cancelled, userdata, user, format_version | ||
FROM batches | ||
SELECT job_groups.batch_id, job_groups.job_group_id, job_groups_cancelled.id IS NOT NULL AS cancelled, userdata, job_groups.user, format_version | ||
FROM job_groups | ||
LEFT JOIN batches ON batches.id = job_groups.batch_id | ||
LEFT JOIN job_groups_cancelled | ||
ON batches.id = job_groups_cancelled.id | ||
WHERE user = %s AND `state` = 'running'; | ||
ON job_groups.batch_id = job_groups_cancelled.id AND job_groups.job_group_id = job_groups_cancelled.job_group_id | ||
WHERE job_groups.user = %s AND job_groups.`state` = 'running'; | ||
""", | ||
(user,), | ||
): | ||
async for record in self.db.select_and_fetchall( | ||
""" | ||
SELECT jobs.batch_id, jobs.job_id, jobs.spec, jobs.cores_mcpu, regions_bits_rep, COALESCE(SUM(instances.state IS NOT NULL AND | ||
(instances.state = 'pending' OR instances.state = 'active')), 0) as live_attempts | ||
(instances.state = 'pending' OR instances.state = 'active')), 0) as live_attempts, jobs.job_group_id | ||
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled) | ||
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id | ||
LEFT JOIN instances ON attempts.instance_name = instances.name | ||
WHERE jobs.batch_id = %s AND jobs.state = 'Ready' AND always_run = 1 AND jobs.inst_coll = %s | ||
WHERE jobs.batch_id = %s AND jobs.job_group_id = %s AND jobs.state = 'Ready' AND always_run = 1 AND jobs.inst_coll = %s | ||
GROUP BY jobs.job_id, jobs.spec, jobs.cores_mcpu | ||
HAVING live_attempts = 0 | ||
LIMIT %s; | ||
""", | ||
(batch['id'], self.name, remaining.value), | ||
(job_group['batch_id'], job_group['job_group_id'], self.name, remaining.value), | ||
): | ||
record['batch_id'] = batch['id'] | ||
record['userdata'] = batch['userdata'] | ||
record['user'] = batch['user'] | ||
record['format_version'] = batch['format_version'] | ||
record['batch_id'] = job_group['batch_id'] | ||
record['userdata'] = job_group['userdata'] | ||
record['user'] = job_group['user'] | ||
record['format_version'] = job_group['format_version'] | ||
yield record | ||
if not batch['cancelled']: | ||
if not job_group['cancelled']: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should do this filtering in the database. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this is correct. First we schedule all of the always-run jobs. Then we schedule all jobs in job groups that haven't been cancelled. |
||
async for record in self.db.select_and_fetchall( | ||
""" | ||
SELECT jobs.batch_id, jobs.job_id, jobs.spec, jobs.cores_mcpu, regions_bits_rep, COALESCE(SUM(instances.state IS NOT NULL AND | ||
(instances.state = 'pending' OR instances.state = 'active')), 0) as live_attempts | ||
(instances.state = 'pending' OR instances.state = 'active')), 0) as live_attempts, job_group_id | ||
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled) | ||
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id | ||
LEFT JOIN instances ON attempts.instance_name = instances.name | ||
WHERE jobs.batch_id = %s AND jobs.state = 'Ready' AND always_run = 0 AND jobs.inst_coll = %s AND cancelled = 0 | ||
WHERE jobs.batch_id = %s AND jobs.job_group_id = %s AND jobs.state = 'Ready' AND always_run = 0 AND jobs.inst_coll = %s AND cancelled = 0 | ||
GROUP BY jobs.job_id, jobs.spec, jobs.cores_mcpu | ||
HAVING live_attempts = 0 | ||
LIMIT %s | ||
""", | ||
(batch['id'], self.name, remaining.value), | ||
(job_group['batch_id'], job_group['job_group_id'], self.name, remaining.value), | ||
): | ||
record['batch_id'] = batch['id'] | ||
record['userdata'] = batch['userdata'] | ||
record['user'] = batch['user'] | ||
record['format_version'] = batch['format_version'] | ||
record['batch_id'] = job_group['batch_id'] | ||
record['userdata'] = job_group['userdata'] | ||
record['user'] = job_group['user'] | ||
record['format_version'] = job_group['format_version'] | ||
yield record | ||
|
||
waitable_pool = WaitableSharedPool(self.async_worker_pool) | ||
|
@@ -420,6 +422,7 @@ async def user_runnable_jobs(user, remaining) -> AsyncIterator[Dict[str, Any]]: | |
id = (batch_id, job_id) | ||
attempt_id = secret_alnum_string(6) | ||
record['attempt_id'] = attempt_id | ||
job_group_id = record['job_group_id'] | ||
|
||
if n_user_instances_created >= n_allocated_instances: | ||
if random.random() > self.exceeded_shares_counter.rate(): | ||
|
@@ -435,7 +438,7 @@ async def user_runnable_jobs(user, remaining) -> AsyncIterator[Dict[str, Any]]: | |
log.info(f'creating job private instance for job {id}') | ||
|
||
async def create_instance_with_error_handling( | ||
batch_id: int, job_id: int, attempt_id: str, record: dict, id: Tuple[int, int] | ||
batch_id: int, job_id: int, attempt_id: str, job_group_id: int, record: dict, id: Tuple[int, int] | ||
): | ||
try: | ||
batch_format_version = BatchFormatVersion(record['format_version']) | ||
|
@@ -458,6 +461,7 @@ async def create_instance_with_error_handling( | |
await mark_job_errored( | ||
self.app, | ||
batch_id, | ||
job_group_id, | ||
job_id, | ||
attempt_id, | ||
record['user'], | ||
|
@@ -467,7 +471,9 @@ async def create_instance_with_error_handling( | |
except Exception: | ||
log.exception(f'while creating job private instance for job {id}', exc_info=True) | ||
|
||
await waitable_pool.call(create_instance_with_error_handling, batch_id, job_id, attempt_id, record, id) | ||
await waitable_pool.call( | ||
create_instance_with_error_handling, batch_id, job_id, attempt_id, job_group_id, record, id | ||
) | ||
|
||
remaining.value -= 1 | ||
if remaining.value <= 0: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why does this do a Python-side filter rather than a SQL-side filter like the next query?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are two types of cancelled jobs. One is for cancelled job groups / batches, in which all jobs that are not always-run are cancelled. But then there's cancellation of individual jobs if their parents failed. I'm guessing it was set up this way with two separate queries to optimize speed and to be able to use the `remaining` shared variable. I can see if there's a way to optimize it into one query if that's something you want to see happen.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And this only applies to ready jobs -- not creating or running jobs.