Skip to content

Commit

Permalink
Enable copy-on-write in pandas
Browse files Browse the repository at this point in the history
Testing out the new behavior that will be the default in Pandas 3.0

https://pandas.pydata.org/docs/dev/whatsnew/v2.2.0.html#copy-on-write
  • Loading branch information
samdoran committed May 7, 2024
1 parent f6e7ebb commit b01e60d
Show file tree
Hide file tree
Showing 24 changed files with 98 additions and 0 deletions.
49 changes: 49 additions & 0 deletions deploy/clowdapp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ objects:
value: ${CACHE_TIMEOUT}
- name: TAG_ENABLED_LIMIT
value: ${TAG_ENABLED_LIMIT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
image: ${IMAGE}:${IMAGE_TAG}
initContainers:
- command:
Expand Down Expand Up @@ -297,6 +299,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -496,6 +500,8 @@ objects:
value: ${TAG_ENABLED_LIMIT}
- name: KAFKA_CONNECT
value: ${KAFKA_CONNECT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_PUSHGATEWAY
value: ${PROMETHEUS_PUSHGATEWAY}
- name: UNLEASH_CACHE_DIR
Expand Down Expand Up @@ -629,6 +635,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -813,6 +821,8 @@ objects:
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -976,6 +986,8 @@ objects:
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -1120,6 +1132,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -1296,6 +1310,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -1478,6 +1494,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -1662,6 +1680,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -1848,6 +1868,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -2032,6 +2054,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -2216,6 +2240,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -2402,6 +2428,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -2592,6 +2620,8 @@ objects:
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -2780,6 +2810,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -2962,6 +2994,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -3144,6 +3178,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -3330,6 +3366,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -3518,6 +3556,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -3700,6 +3740,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -3874,6 +3916,8 @@ objects:
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down Expand Up @@ -4355,6 +4399,11 @@ parameters:
name: PANDAS_COLUMN_BATCH_SIZE
required: true
value: "250"
# PANDAS_COPY_ON_WRITE: template parameter substituted wherever a container's
# env list references ${PANDAS_COPY_ON_WRITE}. It is required, and the default
# "1" is quoted so the value stays a string rather than becoming an integer.
# NOTE(review): presumably pandas reads this env var to switch on
# copy-on-write; confirm that "1" is the enabling value in the pandas docs.
- description: Enable copy-on-write in Pandas
displayName: Pandas copy-on-write
name: PANDAS_COPY_ON_WRITE
required: true
value: "1"
- description: Processing batch size
displayName: Processing batch size
name: REPORT_PROCESSING_BATCH_SIZE
Expand Down
5 changes: 5 additions & 0 deletions deploy/kustomize/base/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,11 @@ parameters:
name: PANDAS_COLUMN_BATCH_SIZE
required: true
value: "250"
# PANDAS_COPY_ON_WRITE: base template parameter consumed by the kustomize
# patches whose env lists reference ${PANDAS_COPY_ON_WRITE}. It is required,
# and the default "1" is quoted so the value stays a string, not an integer.
# NOTE(review): presumably pandas reads this env var to switch on
# copy-on-write; confirm that "1" is the enabling value in the pandas docs.
- description: Enable copy-on-write in Pandas
displayName: Pandas copy-on-write
name: PANDAS_COPY_ON_WRITE
required: true
value: "1"
- description: Processing batch size
displayName: Processing batch size
name: REPORT_PROCESSING_BATCH_SIZE
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/koku.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@
value: ${CACHE_TIMEOUT}
- name: TAG_ENABLED_LIMIT
value: ${TAG_ENABLED_LIMIT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
livenessProbe:
httpGet:
path: ${API_PATH_PREFIX}/v1/status/
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/listener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/masu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@
value: ${TAG_ENABLED_LIMIT}
- name: KAFKA_CONNECT
value: ${KAFKA_CONNECT}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_PUSHGATEWAY
value: ${PROMETHEUS_PUSHGATEWAY}
- name: UNLEASH_CACHE_DIR
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/scheduler.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/sources-client.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/sources-listener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@
value: ${RBAC_SERVICE_PATH}
- name: RBAC_CACHE_TTL
value: ${RBAC_CACHE_TTL}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: PROMETHEUS_MULTIPROC_DIR
value: ${PROMETHEUS_DIR}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-celery.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-cost-model-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-cost-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-download-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-hcs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-ocp-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-ocp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-priority-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-priority.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COLUMN_BATCH_SIZE
value: ${PANDAS_COLUMN_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-refresh-xl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-refresh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
2 changes: 2 additions & 0 deletions deploy/kustomize/patches/worker-subs-extraction.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
value: ${ENABLE_S3_ARCHIVING}
- name: PARQUET_PROCESSING_BATCH_SIZE
value: ${PARQUET_PROCESSING_BATCH_SIZE}
- name: PANDAS_COPY_ON_WRITE
value: ${PANDAS_COPY_ON_WRITE}
- name: TRINO_DATE_STEP
value: ${TRINO_DATE_STEP}
- name: KOKU_ENABLE_SENTRY
Expand Down
Loading

0 comments on commit b01e60d

Please sign in to comment.