Skip to content

Commit

Permalink
Enable copy-on-write in pandas
Browse files Browse the repository at this point in the history
Testing out the copy-on-write behavior that will become the default in pandas 3.0.

https://pandas.pydata.org/docs/dev/whatsnew/v2.2.0.html#copy-on-write
  • Loading branch information
samdoran committed Apr 4, 2024
1 parent 67bb9f3 commit efcfec3
Show file tree
Hide file tree
Showing 24 changed files with 98 additions and 0 deletions.
49 changes: 49 additions & 0 deletions deploy/clowdapp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ objects:
value: ${CACHE_TIMEOUT}
- name: TAG_ENABLED_LIMIT
value: ${TAG_ENABLED_LIMIT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
image: ${IMAGE}:${IMAGE_TAG}
initContainers:
- command:
Expand Down Expand Up @@ -315,6 +317,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -528,6 +532,8 @@ objects:
value: ${TAG_ENABLED_LIMIT}
- name: KAFKA_CONNECT
value: ${KAFKA_CONNECT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_PUSHGATEWAY
value: ${PROMETHEUS_PUSHGATEWAY}
- name: UNLEASH_CACHE_DIR
Expand Down Expand Up @@ -673,6 +679,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -877,6 +885,8 @@ objects:
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -1054,6 +1064,8 @@ objects:
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -1212,6 +1224,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -1408,6 +1422,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -1610,6 +1626,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -1814,6 +1832,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -2020,6 +2040,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -2224,6 +2246,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -2428,6 +2452,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -2634,6 +2660,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -2844,6 +2872,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -3052,6 +3082,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -3254,6 +3286,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -3456,6 +3490,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -3662,6 +3698,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -3870,6 +3908,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -4072,6 +4112,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -4238,6 +4280,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -4715,6 +4759,11 @@ parameters:
name: PANDAS_COLUMN_BATCH_SIZE
required: true
value: "250"
# Template parameter PANDAS_COPY_ON_WRITE: required, defaults to the string "1".
# Referenced as ${PANDAS_COPY_ON_WRITE} by the `- name: PANDAS_COPY_ON_WRITE` entries in this template's objects.
# NOTE(review): "1" is presumably interpreted as a truthy flag by the application — confirm against the settings code.
- description: Enable copy-on-write in Pandas
displayName: Pandas copy-on-write
name: PANDAS_COPY_ON_WRITE
required: true
value: "1"
- description: Processing batch size
displayName: Processing batch size
name: REPORT_PROCESSING_BATCH_SIZE
Expand Down
5 changes: 5 additions & 0 deletions deploy/kustomize/base/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,11 @@ parameters:
name: PANDAS_COLUMN_BATCH_SIZE
required: true
value: "250"
# Base template parameter PANDAS_COPY_ON_WRITE: required, defaults to the string "1".
# Referenced as ${PANDAS_COPY_ON_WRITE} by the patch files under deploy/kustomize/patches/.
# NOTE(review): "1" is presumably interpreted as a truthy flag by the application — confirm against the settings code.
- description: Enable copy-on-write in Pandas
displayName: Pandas copy-on-write
name: PANDAS_COPY_ON_WRITE
required: true
value: "1"
- description: Processing batch size
displayName: Processing batch size
name: REPORT_PROCESSING_BATCH_SIZE
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/koku.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@
value: ${CACHE_TIMEOUT}
- name: TAG_ENABLED_LIMIT
value: ${TAG_ENABLED_LIMIT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
livenessProbe:
httpGet:
path: ${API_PATH_PREFIX}/v1/status/
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/listener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/masu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@
value: ${TAG_ENABLED_LIMIT}
- name: KAFKA_CONNECT
value: ${KAFKA_CONNECT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_PUSHGATEWAY
value: ${PROMETHEUS_PUSHGATEWAY}
- name: UNLEASH_CACHE_DIR
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/scheduler.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/sources-client.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/sources-listener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-celery.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-cost-model-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-cost-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-download-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-hcs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-ocp-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-ocp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-priority-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-priority.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-refresh-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-refresh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-subs-extraction.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
Loading

0 comments on commit efcfec3

Please sign in to comment.