117 changes: 117 additions & 0 deletions acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml
@@ -0,0 +1,117 @@
bundle:
  name: wal-chain-test

resources:
  jobs:
    # Linear chain: job_01 -> job_02 -> ... -> job_10
    # Execution order: job_01 first, job_10 last
    job_01:
      name: "job-01"
      description: "first in chain"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_02:
      name: "job-02"
      description: "depends on ${resources.jobs.job_01.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_03:
      name: "job-03"
      description: "depends on ${resources.jobs.job_02.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_04:
      name: "job-04"
      description: "depends on ${resources.jobs.job_03.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_05:
      name: "job-05"
      description: "depends on ${resources.jobs.job_04.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_06:
      name: "job-06"
      description: "depends on ${resources.jobs.job_05.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_07:
      name: "job-07"
      description: "depends on ${resources.jobs.job_06.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_08:
      name: "job-08"
      description: "depends on ${resources.jobs.job_07.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_09:
      name: "job-09"
      description: "depends on ${resources.jobs.job_08.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_10:
      name: "job-10"
      description: "depends on ${resources.jobs.job_09.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
5 changes: 5 additions & 0 deletions acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml

73 changes: 73 additions & 0 deletions acceptance/bundle/deploy/wal/chain-10-jobs/output.txt
@@ -0,0 +1,73 @@
=== First deploy (crashes on job_10) ===

>>> errcode [CLI] bundle deploy
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files...
Deploying resources...
[PROCESS_KILLED]

Exit code: [KILLED]

=== WAL content after crash ===
{"lineage":"[UUID]","serial": [SERIAL]}
{"k":"resources.jobs.job_01","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"first in chain","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-01","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]}}}
{"k":"resources.jobs.job_02","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-02","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_01","label":"${resources.jobs.job_01.id}"}]}}
{"k":"resources.jobs.job_03","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-03","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_02","label":"${resources.jobs.job_02.id}"}]}}
{"k":"resources.jobs.job_04","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-04","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_03","label":"${resources.jobs.job_03.id}"}]}}
{"k":"resources.jobs.job_05","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-05","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_04","label":"${resources.jobs.job_04.id}"}]}}
{"k":"resources.jobs.job_06","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-06","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_05","label":"${resources.jobs.job_05.id}"}]}}
{"k":"resources.jobs.job_07","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-07","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_06","label":"${resources.jobs.job_06.id}"}]}}
{"k":"resources.jobs.job_08","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-08","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_07","label":"${resources.jobs.job_07.id}"}]}}
{"k":"resources.jobs.job_09","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-09","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_08","label":"${resources.jobs.job_08.id}"}]}}

=== Number of jobs saved in WAL ===
9

=== Bundle summary (reads from WAL) ===
Name: wal-chain-test
Target: default
Workspace:
  User: [USERNAME]
  Path: /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default
Resources:
  Jobs:
    job_01:
      Name: job-01
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_02:
      Name: job-02
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_03:
      Name: job-03
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_04:
      Name: job-04
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_05:
      Name: job-05
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_06:
      Name: job-06
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_07:
      Name: job-07
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_08:
      Name: job-08
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_09:
      Name: job-09
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_10:
      Name: job-10
      URL: (not deployed)

=== Second deploy (recovery) ===

>>> [CLI] bundle deploy --force-lock
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files...
Deploying resources...
Updating deployment state...
Deployment complete!

=== WAL after successful deploy ===
WAL deleted (expected)
22 changes: 22 additions & 0 deletions acceptance/bundle/deploy/wal/chain-10-jobs/script
@@ -0,0 +1,22 @@
echo "=== First deploy (crashes on job_10) ==="
trace errcode $CLI bundle deploy

echo ""
echo "=== WAL content after crash ==="
cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "No WAL file"

echo ""
echo "=== Number of jobs saved in WAL ==="
grep -c '"k":"resources.jobs' .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "0"

echo ""
echo "=== Bundle summary (reads from WAL) ==="
$CLI bundle summary

echo ""
echo "=== Second deploy (recovery) ==="
trace $CLI bundle deploy --force-lock

echo ""
echo "=== WAL after successful deploy ==="
cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "WAL deleted (expected)"
1 change: 1 addition & 0 deletions acceptance/bundle/deploy/wal/chain-10-jobs/test.py
@@ -0,0 +1 @@
print("test")
17 changes: 17 additions & 0 deletions acceptance/bundle/deploy/wal/chain-10-jobs/test.toml
@@ -0,0 +1,17 @@
# Linear chain: job_01 -> job_02 -> ... -> job_10
# Let the first 9 jobs/create calls succeed, then kill the CLI on the 10th
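# Every create returns the same mock job_id (1001), so resolved dependencies all read "depends on 1001".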

[[Server]]
Pattern = "POST /api/2.2/jobs/create"
KillCallerOffset = 9
KillCaller = 1
Response.Body = '{"job_id": 1001}'

[[Server]]
Pattern = "POST /api/2.2/jobs/reset"
Response.Body = '{}'

[[Server]]
Pattern = "GET /api/2.2/jobs/get"
Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}'

25 changes: 25 additions & 0 deletions acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml
@@ -0,0 +1,25 @@
bundle:
  name: wal-corrupted-test

resources:
  jobs:
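    # Two independent jobs; both have valid WAL entries that should be recovered.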
    valid_job:
      name: "valid-job"
      tasks:
        - task_key: "task-a"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    another_valid:
      name: "another-valid"
      tasks:
        - task_key: "task-b"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0

24 changes: 24 additions & 0 deletions acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt
@@ -0,0 +1,24 @@
=== Creating state file with serial 5 ===
=== Creating WAL with corrupted LAST entry ===
=== WAL content ===
{"lineage":"test-lineage-123","serial": [SERIAL]}
{"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}}
{"k":"resources.jobs.another_valid","v":{"__id__": "[ID]","state":{"name":"another-valid"}}}
not valid json - corrupted last line (partial write from crash)
=== Deploy (should recover valid entries, skip corrupted last line) ===

>>> [CLI] bundle deploy
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files...
Deploying resources...
Updating deployment state...
Deployment complete!
=== Final state (should have recovered entries) ===
{
  "serial": [SERIAL],
  "state_keys": [
    "resources.jobs.another_valid",
    "resources.jobs.valid_job"
  ]
}
=== WAL after successful deploy ===
WAL deleted (expected)
37 changes: 37 additions & 0 deletions acceptance/bundle/deploy/wal/corrupted-wal-entry/script
@@ -0,0 +1,37 @@
echo "=== Creating state file with serial 5 ==="
mkdir -p .databricks/bundle/default
cat > .databricks/bundle/default/resources.json << 'EOF'
{
  "state_version": 1,
  "cli_version": "0.0.0",
  "lineage": "test-lineage-123",
  "serial": 5,
  "state": {}
}
EOF

echo "=== Creating WAL with corrupted LAST entry ==="
# Corrupted last line is expected (partial write from crash) and should be skipped.
# Valid entries before it should be recovered.
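# The WAL header reuses the state file's lineage and bumps the serial (6, one past the state file's 5).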
cat > .databricks/bundle/default/resources.json.wal << 'EOF'
{"lineage":"test-lineage-123","serial":6}
{"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}}
{"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}}
not valid json - corrupted last line (partial write from crash)
EOF

echo "=== WAL content ==="
cat .databricks/bundle/default/resources.json.wal

echo "=== Deploy (should recover valid entries, skip corrupted last line) ==="
trace $CLI bundle deploy 2>&1

echo "=== Final state (should have recovered entries) ==="
jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' .databricks/bundle/default/resources.json

echo "=== WAL after successful deploy ==="
if [ -f ".databricks/bundle/default/resources.json.wal" ]; then
  echo "WAL exists (unexpected)"
else
  echo "WAL deleted (expected)"
fi
1 change: 1 addition & 0 deletions acceptance/bundle/deploy/wal/corrupted-wal-entry/test.py
@@ -0,0 +1 @@
print("test")
14 changes: 14 additions & 0 deletions acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml
@@ -0,0 +1,14 @@
# WAL with corrupted LAST entry - valid entries should be recovered, corrupted last line skipped.
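# The jobs/get stubs below answer the deploy's lookups for the two recovered IDs (1111 and 2222).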

[[Server]]
Pattern = "POST /api/2.2/jobs/reset"
Response.Body = '{}'

[[Server]]
Pattern = "GET /api/2.2/jobs/get?job_id=1111"
Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}'

[[Server]]
Pattern = "GET /api/2.2/jobs/get?job_id=2222"
Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}'

@@ -0,0 +1,25 @@
bundle:
  name: wal-corrupted-middle-test

resources:
  jobs:
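    # Middle-corruption variant: the malformed WAL line falls between these two job entries.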
    job_one:
      name: "job-one"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_two:
      name: "job-two"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
