Skip to content

Commit 40a70cf

Browse files
test: performance test CI work (#8761)
1 parent 36a2e29 commit 40a70cf

File tree

8 files changed

+571
-211
lines changed

8 files changed

+571
-211
lines changed
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# devcluster configuration with a single `master` stage.
# NOTE(review): presumably this is the `perftest.devcluster.yaml` that the
# `test-perf` CI job passes to `start-devcluster` -- confirm the file path.
stages:
  - master:
      # Shell command executed before the stage starts.
      pre:
        - sh: make -C tools prep-root

      # Master configuration handed to the stage.
      config_file:
        db:
          # Host is localhost since we connect through ssh forwarding.
          host: localhost
          port: 5432
          # Credentials are supplied through environment variables.
          user: $PERF_DB_USER
          password: $PERF_DB_PASS
          name: postgres
          ssl_mode: require

        checkpoint_storage:
          type: shared_fs
          host_path: /tmp
          storage_path: determined-cp

        log:
          level: debug

        root: tools/build

        cache:
          cache_dir: /tmp/determined-cache

        launch_error: false

        security:
          authz:
            rbac_ui_enabled: true

        # One agent-based resource manager; aux and compute work both go to
        # the single `default` pool declared below.
        resource_manager:
          type: agent
          default_aux_resource_pool: default
          default_compute_resource_pool: default

        resource_pools:
          - pool_name: default

.circleci/real_config.yml

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2132,6 +2132,172 @@ jobs:
21322132
path: /tmp/priority_scheduler
21332133
destination: devcluster-priority_scheduler-logs
21342134

2135+
# Performance-test job.
#
# Starts a devcluster master (reached through an ssh port-forward to the
# database), waits for and records database migrations, runs the k6
# performance suite from `performance/`, and uploads the results.
#
# Parameters:
#   snapshot-after-migrations: when true, take an RDS snapshot named after the
#     current commit once migrations have run (skipped if
#     /tmp/no-migrations-needed exists).
#   deploy-db: when true, restore a throwaway RDS instance
#     (`ci-perf-db-<build num>`) from the newest matching snapshot, point the
#     job at it, and delete it at the end of the job.
test-perf:
  parameters:
    snapshot-after-migrations:
      type: boolean
      default: false
    deploy-db:
      type: boolean
      default: false
  machine:
    image: <<pipeline.parameters.machine-image>>
  resource_class: xlarge
  steps:
    - queue/until_front_of_line:
        only-on-branch: main
        time: "120" # Wait two hours at most. Adjust this over time.
    - checkout
    - attach_workspace:
        at: .
    - setup-python-venv:
        executor: <<pipeline.parameters.machine-image>>
    - install-devcluster
    - run:
        name: Install upload deps
        command: tools/scripts/retry.sh pip install requests determined psycopg2-binary

    - when:
        condition: <<parameters.deploy-db>>
        steps:
          - run:
              name: Select snapshot to use
              command: |
                # Default to the base snapshot; overridden below if a newer
                # commit-specific snapshot exists in this branch's history.
                echo 'export PERF_SNAPSHOT_TO_USE="perf-test-base-snapshot"' >> "$BASH_ENV"

                # Snapshot ids look like `ci-snapshot-commit-<sha>`, so the
                # fourth dash-separated field is the commit hash.
                SNAPSHOT_COMMITS=$(aws rds describe-db-snapshots \
                  --region="us-west-2" \
                  --query "DBSnapshots[?TagList[?Key=='ci-snapshot']].DBSnapshotIdentifier" \
                  --output json | jq -r '.[] | split("-")[3]')
                echo "Snapshot commits (${SNAPSHOT_COMMITS})"

                # Walk the 1001 most recent commits (newest first) with a
                # single `git log` call and stop at the first one that has a
                # snapshot. Empty lines are skipped: an empty hash would match
                # the regex below and select a bogus "ci-snapshot-commit-" id.
                while read -r COMMIT; do
                  [[ -n "$COMMIT" ]] || continue

                  if [[ " $SNAPSHOT_COMMITS " =~ .*"$COMMIT".* ]]; then
                    echo "export PERF_SNAPSHOT_TO_USE=\"ci-snapshot-commit-${COMMIT}\"" >> "$BASH_ENV"
                    break
                  fi
                done < <(git log --format="%H" -n 1001)

                source "$BASH_ENV"
                echo "Deciding to use $PERF_SNAPSHOT_TO_USE"
          - run:
              name: Wait for snapshot to be available
              command: |
                aws rds wait db-snapshot-available \
                  --region "us-west-2" \
                  --db-snapshot-identifier "${PERF_SNAPSHOT_TO_USE}"
          - run:
              name: Deploy database
              command: |
                aws rds restore-db-instance-from-db-snapshot \
                  --region="us-west-2" \
                  --db-snapshot-identifier="${PERF_SNAPSHOT_TO_USE}" \
                  --db-instance-identifier="ci-perf-db-${CIRCLE_BUILD_NUM}" \
                  --no-multi-az \
                  --no-publicly-accessible \
                  --no-auto-minor-version-upgrade \
                  --db-parameter-group-name="logquerieslong" \
                  --tags "Key=ci-snapshot" \
                  --vpc-security-group-ids="${PERF_DB_SECURITY_GROUP_ID}"
              no_output_timeout: 30m
          - run:
              name: Wait for database to be ready
              # The waiter polls silently, so raise the no-output timeout to
              # keep CircleCI from killing the step while the restore runs.
              command: |
                aws rds wait db-instance-available \
                  --region="us-west-2" \
                  --db-instance-identifier="ci-perf-db-${CIRCLE_BUILD_NUM}"
              no_output_timeout: 30m
          - run:
              name: Get db instance host
              # Runs after the wait above: the endpoint address is only
              # queried once the instance is reported available.
              command: |
                RDS_HOST=$(aws rds describe-db-instances \
                  --region us-west-2 \
                  --db-instance-identifier "ci-perf-db-${CIRCLE_BUILD_NUM}" \
                  --query "DBInstances[0].Endpoint.Address" \
                  --output text)
                echo "export RDS_HOST=\"${RDS_HOST}\"" >> "$BASH_ENV"
                # The port-forward step below reads PERF_DB_HOST, so point it
                # at the freshly restored instance as well.
                echo "export PERF_DB_HOST=\"${RDS_HOST}\"" >> "$BASH_ENV"
                source "$BASH_ENV"
                echo "perf db host ${PERF_DB_HOST}"

    - run:
        name: Add SSH key
        command: echo "${PERF_DB_BASTION_SSH_KEY}" | base64 --decode | ssh-add -
    - run:
        name: Port forward to bastion instance
        command: ssh -L 5432:${PERF_DB_HOST}:5432 -N -f ubuntu@$PERF_DB_BASTION_HOST
    - start-devcluster:
        target-stage: master
        devcluster-config: perftest.devcluster.yaml
    - run:
        name: Wait and record any migrations ran
        command: python .circleci/scripts/wait_for_perf_migration_upload_results.py

    - when:
        condition: <<parameters.snapshot-after-migrations>>
        steps:
          - run:
              name: Take and wait for RDS snapshot, only on main and when migrations were applied
              # NOTE(review): /tmp/no-migrations-needed is presumably written
              # by the migration-wait script above -- confirm.
              command: |
                if [ -f /tmp/no-migrations-needed ]; then
                  echo "/tmp/no-migrations-needed exists, no need to take a snapshot"
                  exit 0
                fi

                COMMIT=$(git log -1 --pretty=format:%H)
                echo "Taking snapshot"
                aws rds create-db-snapshot \
                  --region="us-west-2" \
                  --db-instance-identifier="${PERF_DB_AWS_NAME}" \
                  --db-snapshot-identifier="ci-snapshot-commit-${COMMIT}" \
                  --tags "Key=ci-snapshot"

                echo "Snapshot taken now waiting for it to become completed"
                aws rds wait db-snapshot-completed \
                  --region="us-west-2" \
                  --db-snapshot-identifier="ci-snapshot-commit-${COMMIT}"
                echo "Snapshot completed"
              # The waiter is silent while polling; see the database wait above.
              no_output_timeout: 30m
    - run:
        name: Build performance test Docker image
        command: make -C performance build
    - run:
        name: Run performance test
        command: |
          export PERF_DOCKER_FLAGS="--network=host"
          export PERF_K6_FLAGS='-e DET_ADMIN_USERNAME="admin" \
            -e DET_ADMIN_PASSWORD="" \
            -e model_name="tnjpuojqzbluqiyyqilftulsw" \
            -e model_version_number="1" \
            -e trial_id="8282" \
            -e experiment_id="100" \
            -e task_id="backported.8282" \
            -e metric_name="85c9" \
            -e metric_type="METRIC_TYPE_TRAINING" \
            -e batches="1800" \
            -e batches_margin="99" \
            -e resource_pool="default"'
          make -C performance run
    - run:
        name: Upload result of performance test to Postgres result db
        command: python .circleci/scripts/upload_perf_results.py ./performance/reports/latest.results.json

    - when:
        condition: <<parameters.deploy-db>>
        steps:
          - run:
              name: Delete RDS instance
              # `when: always` is an attribute of `run`, not of the
              # conditional `when` step; it must live here so the throwaway
              # instance is deleted even when an earlier step failed.
              when: always
              command: |
                aws rds delete-db-instance \
                  --region="us-west-2" \
                  --db-instance-identifier="ci-perf-db-${CIRCLE_BUILD_NUM}" \
                  --skip-final-snapshot

    - slack/status:
        fail_only: false
        only_for_branches: main
        failure_message: ':thisisfine: A \`${CIRCLE_JOB}\` job on branch \`${CIRCLE_BRANCH}\` has failed!'
        mentions: "U03CP4ZKY2D" # Ping Nick Blaskey for now. Eventually switch this to perf team.
2300+
21352301
deploy:
21362302
parameters:
21372303
compute-agent-instance-type:
@@ -2752,6 +2918,20 @@ workflows:
27522918
target-stage: agent
27532919
wait-for-master: false
27542920

2921+
# Perf test on main: uses the existing database (deploy-db is false) and
# snapshots it after migrations have been applied.
- test-perf:
    name: test-perf
    snapshot-after-migrations: true
    deploy-db: false
    requires: [build-go]
    context: [perf-tests, aws]
    filters:
      branches:
        only: [main]
2934+
27552935
- deploy:
27562936
name: deploy-latest-master-cluster
27572937
enable-cors: true
@@ -3125,6 +3305,27 @@ workflows:
31253305
aux-agent-instance-type: ["m5.large"]
31263306
max-dynamic-agents: [2]
31273307

3308+
# Perf tests.
# Gated behind a manual approval; once approved, build-go runs and the perf
# test restores its own throwaway database (deploy-db is true) without taking
# a snapshot afterwards.
- request-perf-tests:
    type: approval
    filters: *upstream-feature-branch

- build-go:
    requires: [request-perf-tests]

- test-perf:
    name: test-perf-feature-branch
    snapshot-after-migrations: false
    deploy-db: true
    requires: [build-go, request-perf-tests]
    context: [perf-tests, aws]
    filters: *upstream-feature-branch
3328+
31283329
# Nightly tests
31293330
- request-gpu-nightly:
31303331
type: approval

0 commit comments

Comments
 (0)