Skip to content

Commit

Permalink
Enable copy-on-write in pandas
Browse files Browse the repository at this point in the history
Test the new copy-on-write behavior that will become the default in pandas 3.0.

https://pandas.pydata.org/docs/dev/whatsnew/v2.2.0.html#copy-on-write
  • Loading branch information
samdoran committed Mar 18, 2024
1 parent 4f12cbc commit d464c7f
Show file tree
Hide file tree
Showing 24 changed files with 98 additions and 0 deletions.
49 changes: 49 additions & 0 deletions deploy/clowdapp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ objects:
value: ${CACHE_TIMEOUT}
- name: TAG_ENABLED_LIMIT
value: ${TAG_ENABLED_LIMIT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
image: ${IMAGE}:${IMAGE_TAG}
initContainers:
- command:
Expand Down Expand Up @@ -313,6 +315,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_API_ENABLE_SENTRY
Expand Down Expand Up @@ -524,6 +528,8 @@ objects:
value: ${TAG_ENABLED_LIMIT}
- name: KAFKA_CONNECT
value: ${KAFKA_CONNECT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_PUSHGATEWAY
value: ${PROMETHEUS_PUSHGATEWAY}
- name: UNLEASH_CACHE_DIR
Expand Down Expand Up @@ -669,6 +675,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -863,6 +871,8 @@ objects:
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_API_ENABLE_SENTRY
Expand Down Expand Up @@ -1038,6 +1048,8 @@ objects:
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_API_ENABLE_SENTRY
Expand Down Expand Up @@ -1194,6 +1206,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -1382,6 +1396,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -1576,6 +1592,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -1772,6 +1790,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -1970,6 +1990,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -2166,6 +2188,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -2362,6 +2386,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -2560,6 +2586,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -2762,6 +2790,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -2962,6 +2992,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -3156,6 +3188,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -3350,6 +3384,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -3548,6 +3584,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -3748,6 +3786,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -3942,6 +3982,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -4100,6 +4142,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down Expand Up @@ -4577,6 +4621,11 @@ parameters:
name: PANDAS_COLUMN_BATCH_SIZE
required: true
value: "250"
- description: Enable copy-on-write in Pandas
displayName: Pandas copy-on-write
name: PANDAS_COPY_ON_WRITE
required: true
value: "1"
- description: Processing batch size
displayName: Processing batch size
name: REPORT_PROCESSING_BATCH_SIZE
Expand Down
5 changes: 5 additions & 0 deletions deploy/kustomize/base/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,11 @@ parameters:
name: PANDAS_COLUMN_BATCH_SIZE
required: true
value: "250"
- description: Enable copy-on-write in Pandas
displayName: Pandas copy-on-write
name: PANDAS_COPY_ON_WRITE
required: true
value: "1"
- description: Processing batch size
displayName: Processing batch size
name: REPORT_PROCESSING_BATCH_SIZE
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/koku.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@
value: ${CACHE_TIMEOUT}
- name: TAG_ENABLED_LIMIT
value: ${TAG_ENABLED_LIMIT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
livenessProbe:
httpGet:
path: ${API_PATH_PREFIX}/v1/status/
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/listener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_API_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/masu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@
value: ${TAG_ENABLED_LIMIT}
- name: KAFKA_CONNECT
value: ${KAFKA_CONNECT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_PUSHGATEWAY
value: ${PROMETHEUS_PUSHGATEWAY}
- name: UNLEASH_CACHE_DIR
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/scheduler.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/sources-client.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_API_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/sources-listener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_API_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-celery.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-cost-model-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-cost-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-download-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-hcs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-ocp-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-ocp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-priority-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-priority.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-refresh-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-refresh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-subs-extraction.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_CELERY_ENABLE_SENTRY
Expand Down
Loading

0 comments on commit d464c7f

Please sign in to comment.