From 2f3e978ec001ae74efe1f000ce64339a6f080ef1 Mon Sep 17 00:00:00 2001 From: Lee Sarah Date: Thu, 14 Dec 2023 15:02:17 +0900 Subject: [PATCH] Update index.md --- content/post/DevCourse/day-44/index.md | 108 ++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/content/post/DevCourse/day-44/index.md b/content/post/DevCourse/day-44/index.md index d4af1ac..756ecb9 100644 --- a/content/post/DevCourse/day-44/index.md +++ b/content/post/DevCourse/day-44/index.md @@ -12,18 +12,124 @@ tags = [ ] +++ + + # ๐Ÿ“‹ย ๊ณต๋ถ€ ๋‚ด์šฉ ## Airflow ์‹ค์Šต : DAG ๊ตฌํ˜„ํ•˜๊ธฐ ## Primary Key Uniqueness ๋ณด์žฅํ•˜๊ธฐ +### ํ€ด์ฆˆ + +```python +# Weather_to_Redshift_v2.py + +INSERT INTO {schema}.{table} +SELECT date, temp, min_temp, max_temp FROM ( +SELECT *, ROW_NUMBER() OVER (PARTITION BY date ORDER BY created_date DESC) seq +FROM t +) +WHERE seq = 1; +``` + +> ์—ฌ๊ธฐ์„œ transaction์œผ๋กœ ์ฒ˜๋ฆฌ๋˜์–ด์•ผ ํ•˜๋Š” ์ตœ์†Œ ๋ฒ”์œ„์˜ SQL๋“ค์€? + +### Upsert + +> Insert & Update + +- primary key๋ฅผ ๊ธฐ์ค€์œผ๋กœ + - ์กด์žฌํ•˜๋Š” ๋ ˆ์ฝ”๋“œ๋ผ๋ฉด, ์ƒˆ ์ •๋ณด๋กœ ์ˆ˜์ • + - ์กด์žฌํ•˜์ง€ ์•Š๋Š” ๋ ˆ์ฝ”๋“œ๋ผ๋ฉด ์ƒˆ ๋ ˆ์ฝ”๋“œ ์ ์žฌ +- DW๋งˆ๋‹ค UPSERT๋ฅผ ํšจ์œจ์ ์œผ๋กœ ์‹คํ–‰ํ•ด์ฃผ๋Š” ๋ฌธ๋ฒ•์„ ์ง€์›ํ•ด์คŒ + - [์ž์„ธํ•œ ์„ค๋ช…](#mysql-to-redshift-dag) + ## Backfill -### +> ๋ฐ์ดํ„ฐ๋ฅผ ์ฝ์–ด์˜ค๋Š” ๋ฐ ์‹คํŒจํ•˜๊ฑฐ๋‚˜, ์ฝ์–ด์˜จ ๋ฐ์ดํ„ฐ์˜ ๋ฌธ์ œ ๋•Œ๋ฌธ์— ๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ์„ ์žฌ์‹คํ–‰ํ•˜์—ฌ ๋‹ค์‹œ ์ฝ์–ด์™€์•ผ ํ•˜๋Š” ๊ณผ์ • + +### Incremental Update ์‹คํŒจ + +- ํ•˜๋ฃจ์— ํ•œ ๋ฒˆ ๋™์ž‘ํ•˜๋Š” incremental update +- ์ค‘๊ฐ„์— ๋ฉฐ์น ๋™์•ˆ ์ด ๊ณผ์ •์ด ์‹คํŒจํ•œ ๊ฒฝ์šฐ, ๊ทธ ์ดํ›„์˜ ์‹คํ–‰์—๋„ ์˜ํ–ฅ์„ ์ฃผ๊ฒŒ ๋˜์–ด์žˆ์Œ +- ์‹คํŒจํ•œ ๋ถ€๋ถ„์„ ์žฌ์‹คํ–‰ -> ์–ผ๋งˆ๋‚˜ ์ค‘์š”ํ•œ๊ฐ€? + +### Backfill์˜ ์šฉ์ด์„ฑ + +> ์‹คํŒจํ•œ ๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ์˜ ์žฌ์‹คํ–‰์ด ์–ผ๋งˆ๋‚˜ ์šฉ์ดํ•œ ๊ตฌ์กฐ์ธ๊ฐ€? 
+ +- full refresh + + - ๋ฌธ์ œ๊ฐ€ ์ƒ๊ธฐ๋ฉด ๋‹ค์‹œ ์‹คํ–‰ํ•˜๋ฉด ๋จ + - backfill ๋ถˆํ•„์š” + +- Incremental Update + - ๋ฐ์ดํ„ฐ๋ฅผ ๋‹ค์‹œ ์ฝ์–ด์™€์•ผ ํ•˜๋ฉด ์ฒ˜์Œ๋ถ€ํ„ฐ ๋ชจ๋‘ ๋‹ค ์žฌ์‹คํ–‰ํ•ด์•ผ ํ•จ ( ํšจ์œจ์„ฑ์€ ๋” ์ข‹์„ ์ˆ˜ ์žˆ์ง€๋งŒ, ์šด์˜&์œ ์ง€๋ณด์ˆ˜๊ฐ€ ์–ด๋ ค์›Œ์ง) + - backfill ํ•„์š” + +> Airflow : backfill์„ ์‰ฝ๊ฒŒ ํ•  ์ˆ˜ ์žˆ๋„๋ก ๋””์ž์ธ๋จ + +### Backfill of Daily DAG + +#### Daily DAG + +- ์ง€๊ธˆ ์‹œ๊ฐ„์„ ๊ธฐ์ค€์œผ๋กœ ์–ด์ œ ๋‚ ์งœ๋ฅผ ๊ณ„์‚ฐ, ์–ด์ œ ๋ฐ์ดํ„ฐ๋ฅผ ์ฝ์–ด์˜ด +- ๋งค์ผ ๋ฌธ์ œ ์—†์ด ๋™์ž‘ํ•˜๋ฉด OK, BUT ๋ฐ์ดํ„ฐ ์ฝ์–ด์˜ค๊ธฐ์— ์‹คํŒจํ•˜๋Š” ๊ฒฝ์šฐ ? -> ํŠน์ • ๋‚ ์งœ์˜ ๋ฐ์ดํ„ฐ๊ฐ€ ๋น ์ ธ์žˆ์Œ -> ์‹คํŒจํ•œ ๋‚  ๊ธฐ์ค€์œผ๋กœ ์ „๋‚ ์˜ ๋ฐ์ดํ„ฐ๋ฅผ ์—…๋ฐ์ดํŠธ ํ•˜๋Š” ์ฝ”๋“œ๋ฅผ ์ƒˆ๋กœ ์ž‘์„ฑํ•ด์•ผ ํ•จ (์›ํ•˜๋Š” ๋‚ ์งœ๋ฅผ ํ•˜๋“œ์ฝ”๋”ฉํ•˜๋Š” ๋ฐฉ์‹) + + ```python + from datetime import datetime, timedelta + # y = datetime.now() - timedelta(1) + # yesterday = datetime.strftime(y, '%Y-%m-%d') + yesterday = '2023-01-01' + ``` + +- ์‹ค์ˆ˜ํ•˜๊ธฐ ์‰ฝ๊ณ  ์ˆ˜์ •ํ•˜๋Š” ๋ฐ ์‹œ๊ฐ„์ด ๋งŽ์ด ๊ฑธ๋ฆผ + +**`DAG๋ฅผ ์ƒ์„ฑํ•  ๋•Œ ๋ถ€ํ„ฐ backfill์„ ์‰ฝ๊ฒŒ ๋งŒ๋“ค์–ด์•ผ ํ•จ`** + +### Backfill์„ ์šฉ์ดํ•˜๊ฒŒ ํ•˜๋Š” ๊ตฌ์กฐ + +- ๋‚ ์งœ๋ณ„๋กœ backfill ๊ฒฐ๊ณผ๋ฅผ ๊ธฐ๋ก +- ๋‚ ์งœ๋Š” ์‹œ์Šคํ…œ์—์„œ ETL ์ธ์ž๋กœ ์ œ๊ณต +- ๋‚ ์งœ๋ฅผ ๋”ฐ๋กœ ๊ณ„์‚ฐํ•˜์ง€ ์•Š๊ณ , ์‹œ์Šคํ…œ์ด ์ •ํ•ด์ค€ ๋‚ ์งœ๋ฅผ ์‚ฌ์šฉ + +#### Airflow์˜ ๊ตฌ์กฐ + +- ETL๋ณ„๋กœ ์‹คํ–‰๋‚ ์งœ, ๊ฒฐ๊ณผ๋ฅผ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ DB์— ๊ธฐ๋ก +- ๋ชจ๋“  DAG ์‹คํ–‰์— `execution_date` ์ง€์ • +- `execution_date`๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ๋ฐ์ดํ„ฐ๋ฅผ ๊ฐฑ์‹ ํ•˜๋„๋ก ์ฝ”๋“œ ์ž‘์„ฑ + +## DAG Parameter + +### date ๊ด€๋ จ parameter ์ •๋ฆฌ + +[Daily Incremental Update](#daily-dag) ๊ตฌํ˜„ + +#### start_date + +- 2020-11-07์˜ ๋ฐ์ดํ„ฐ๋ฅผ ์ฝ์–ด์˜ด +- 2020-11-08 ๋ถ€ํ„ฐ ETL ๋™์ž‘ + + -> start_date : 2020-11-07 + +## MySQL to Redshift DAG # ๐Ÿ‘€ย CHECK _(์–ด๋ ต๊ฑฐ๋‚˜ ์ƒˆ๋กญ๊ฒŒ ์•Œ๊ฒŒ ๋œ ๊ฒƒ ๋“ฑ ๋‹ค์‹œ ํ™•์ธํ•  ๊ฒƒ๋“ค)_ +### openweathermap api + +- https://openweathermap.org/api/one-call-3 +- ๊ตฌ๋…ํ•œ ์ดํ›„์— ๋ฐ”๋กœ 
허가가 안되는 문제가 있음.. +- 기존 코드를 2.5 -> 3.0 으로 바꿔야 함 + # ❗ 느낀 점