Skip to content

Commit

Permalink
Enable copy-on-write in pandas
Browse files Browse the repository at this point in the history
Testing out the copy-on-write behavior that will become the default in pandas 3.0

https://pandas.pydata.org/docs/dev/whatsnew/v2.2.0.html#copy-on-write
  • Loading branch information
samdoran committed Mar 28, 2024
1 parent 0f1ac75 commit 6c35cf4
Show file tree
Hide file tree
Showing 24 changed files with 98 additions and 0 deletions.
49 changes: 49 additions & 0 deletions deploy/clowdapp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ objects:
value: ${CACHE_TIMEOUT}
- name: TAG_ENABLED_LIMIT
value: ${TAG_ENABLED_LIMIT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
image: ${IMAGE}:${IMAGE_TAG}
initContainers:
- command:
Expand Down Expand Up @@ -313,6 +315,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_API_ENABLE_SENTRY
Expand Down Expand Up @@ -524,6 +528,8 @@ objects:
value: ${TAG_ENABLED_LIMIT}
- name: KAFKA_CONNECT
value: ${KAFKA_CONNECT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_PUSHGATEWAY
value: ${PROMETHEUS_PUSHGATEWAY}
- name: UNLEASH_CACHE_DIR
Expand Down Expand Up @@ -669,6 +675,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -865,6 +873,8 @@ objects:
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_API_ENABLE_SENTRY
Expand Down Expand Up @@ -1040,6 +1050,8 @@ objects:
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_API_ENABLE_SENTRY
Expand Down Expand Up @@ -1196,6 +1208,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -1384,6 +1398,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -1578,6 +1594,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -1774,6 +1792,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -1972,6 +1992,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -2168,6 +2190,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -2364,6 +2388,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -2562,6 +2588,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -2764,6 +2792,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -2964,6 +2994,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -3158,6 +3190,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -3352,6 +3386,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -3550,6 +3586,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -3750,6 +3788,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -3944,6 +3984,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -4102,6 +4144,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -4579,6 +4623,11 @@ parameters:
name: PANDAS_COLUMN_BATCH_SIZE
required: true
value: "250"
- description: Enable copy-on-write in Pandas
displayName: Pandas copy-on-write
name: PANDAS_COPY_ON_WRITE
required: true
value: "1"
- description: Processing batch size
displayName: Processing batch size
name: REPORT_PROCESSING_BATCH_SIZE
Expand Down
5 changes: 5 additions & 0 deletions deploy/kustomize/base/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,11 @@ parameters:
name: PANDAS_COLUMN_BATCH_SIZE
required: true
value: "250"
- description: Enable copy-on-write in Pandas
displayName: Pandas copy-on-write
name: PANDAS_COPY_ON_WRITE
required: true
value: "1"
- description: Processing batch size
displayName: Processing batch size
name: REPORT_PROCESSING_BATCH_SIZE
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/koku.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@
value: ${CACHE_TIMEOUT}
- name: TAG_ENABLED_LIMIT
value: ${TAG_ENABLED_LIMIT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
livenessProbe:
httpGet:
path: ${API_PATH_PREFIX}/v1/status/
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/listener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_API_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/masu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@
value: ${TAG_ENABLED_LIMIT}
- name: KAFKA_CONNECT
value: ${KAFKA_CONNECT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_PUSHGATEWAY
value: ${PROMETHEUS_PUSHGATEWAY}
- name: UNLEASH_CACHE_DIR
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/scheduler.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/sources-client.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_API_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/sources-listener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_API_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-celery.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-cost-model-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-cost-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-download-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-hcs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-ocp-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-ocp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-priority-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-priority.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-refresh-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-refresh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-subs-extraction.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
Loading

0 comments on commit 6c35cf4

Please sign in to comment.