Showing 52 changed files with 7,967 additions and 0 deletions.
@@ -0,0 +1,19 @@
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.empty import EmptyOperator
from airflow.operators.latest_only import LatestOnlyOperator
from datetime import datetime
from datetime import timedelta


with DAG(
    dag_id='Learn_LatestOnlyOperator',
    schedule=timedelta(hours=48),  # run the DAG every 48 hours
    start_date=datetime(2023, 6, 14),
    catchup=True) as dag:

    t1 = EmptyOperator(task_id='task1')
    t2 = LatestOnlyOperator(task_id='latest_only')
    t3 = EmptyOperator(task_id='task3')
    t4 = EmptyOperator(task_id='task4')

    t1 >> t2 >> [t3, t4]
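
Note: LatestOnlyOperator skips everything downstream of it whenever the run being executed is not the most recent scheduled interval, so with catchup=True the backfilled runs execute only task1 and latest_only while task3 and task4 are marked skipped. A minimal way to watch this locally (a sketch, assuming Airflow 2.5+ where DAG.test() exists):

if __name__ == "__main__":
    # runs a single in-process DagRun for the current date; since that is
    # the latest interval, t3 and t4 execute instead of being skipped
    dag.test()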
@@ -0,0 +1,31 @@
from airflow.models.dag import DAG
from airflow.operators.empty import EmptyOperator
from airflow.operators.bash import BashOperator
from airflow.utils.task_group import TaskGroup
import pendulum


with DAG(dag_id="Learn_Task_Group", start_date=pendulum.today('UTC').add(days=-2), tags=["example"]) as dag:
    start = EmptyOperator(task_id="start")

    # Task Group #1
    with TaskGroup("Download", tooltip="Tasks for downloading data") as section_1:
        task_1 = EmptyOperator(task_id="task_1")
        task_2 = BashOperator(task_id="task_2", bash_command='echo 1')
        task_3 = EmptyOperator(task_id="task_3")

        task_1 >> [task_2, task_3]

    # Task Group #2
    with TaskGroup("Process", tooltip="Tasks for processing data") as section_2:
        task_1 = EmptyOperator(task_id="task_1")

        with TaskGroup("inner_section_2", tooltip="Tasks for inner_section2") as inner_section_2:
            task_2 = BashOperator(task_id="task_2", bash_command='echo 1')
            task_3 = EmptyOperator(task_id="task_3")
            task_4 = EmptyOperator(task_id="task_4")

            [task_2, task_3] >> task_4

    end = EmptyOperator(task_id='end')

    start >> section_1 >> section_2 >> end
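
Note: a TaskGroup prefixes each child's task_id with the group id, which is why both groups above can reuse task_1, task_2, and task_3 without collisions. A quick sketch to confirm the generated ids (hypothetical lines, appended at the bottom of this file):

if __name__ == "__main__":
    print(sorted(dag.task_ids))
    # e.g. ['Download.task_1', ..., 'Process.inner_section_2.task_4', 'Process.task_1', 'end', 'start']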
@@ -0,0 +1,19 @@
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.utils.trigger_rule import TriggerRule
from datetime import datetime, timedelta


default_args = {
    'start_date': datetime(2023, 6, 15)
}

with DAG("Learn_TriggerRule", default_args=default_args, schedule=timedelta(1)) as dag:
    t1 = BashOperator(task_id="print_date", bash_command="date")
    t2 = BashOperator(task_id="sleep", bash_command="sleep 5")
    t3 = BashOperator(task_id="exit", bash_command="exit 1")
    t4 = BashOperator(
        task_id='final_task',
        bash_command='echo DONE!',
        trigger_rule=TriggerRule.ALL_DONE
    )
    [t1, t2, t3] >> t4
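
Note: the default trigger rule is ALL_SUCCESS, so without trigger_rule=TriggerRule.ALL_DONE the exit 1 failure in t3 would leave final_task as upstream_failed; ALL_DONE fires once every upstream task has finished, regardless of outcome. A hedged variant (illustrative names, assumed to sit inside the same with DAG block) is an alert task that fires only when at least one upstream task failed:

alert = BashOperator(
    task_id="alert_on_failure",
    bash_command="echo 'an upstream task failed'",
    trigger_rule=TriggerRule.ONE_FAILED,
)
[t1, t2, t3] >> alert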
@@ -0,0 +1,59 @@
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.transfers.sql_to_s3 import SqlToS3Operator
from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator
from airflow.models import Variable

from datetime import datetime
from datetime import timedelta

import requests
import logging
import psycopg2
import json


dag = DAG(
    dag_id = 'MySQL_to_Redshift',
    start_date = datetime(2022,8,24),  # the DAG will not run if this date is in the future
    schedule = '0 9 * * *',  # adjust as needed
    max_active_runs = 1,
    catchup = False,
    default_args = {
        'retries': 1,
        'retry_delay': timedelta(minutes=3),
    }
)

schema = "zippoo94"
table = "nps"
s3_bucket = "grepp-data-engineering"
s3_key = schema + "-" + table

mysql_to_s3_nps = SqlToS3Operator(
    task_id = 'mysql_to_s3_nps',
    query = "SELECT * FROM prod.nps",
    s3_bucket = s3_bucket,
    s3_key = s3_key,
    sql_conn_id = "mysql_conn_id",
    aws_conn_id = "aws_conn_id",
    verify = False,
    replace = True,
    pd_kwargs={"index": False, "header": False},
    dag = dag
)

s3_to_redshift_nps = S3ToRedshiftOperator(
    task_id = 's3_to_redshift_nps',
    s3_bucket = s3_bucket,
    s3_key = s3_key,
    schema = schema,
    table = table,
    copy_options=['csv'],
    method = 'REPLACE',
    redshift_conn_id = "redshift_dev_db",
    aws_conn_id = "aws_conn_id",
    dag = dag
)

mysql_to_s3_nps >> s3_to_redshift_nps
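
Note: with method = 'REPLACE', S3ToRedshiftOperator rebuilds the table contents from the S3 file on every run. Roughly the SQL it issues on Redshift (a sketch, not the provider's exact statements):

# BEGIN;
# DELETE FROM zippoo94.nps;
# COPY zippoo94.nps FROM 's3://grepp-data-engineering/zippoo94-nps' ... CSV;
# COMMIT;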
@@ -0,0 +1,61 @@
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.transfers.sql_to_s3 import SqlToS3Operator
from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator
from airflow.models import Variable

from datetime import datetime
from datetime import timedelta

import requests
import logging
import psycopg2
import json

dag = DAG(
    dag_id = 'MySQL_to_Redshift_v2',
    start_date = datetime(2023,1,1),  # the DAG will not run if this date is in the future
    schedule = '0 9 * * *',  # adjust as needed
    max_active_runs = 1,
    catchup = False,
    default_args = {
        'retries': 1,
        'retry_delay': timedelta(minutes=3),
    }
)

schema = "keeyong"
table = "nps"
s3_bucket = "grepp-data-engineering"
s3_key = schema + "-" + table  # s3_key = schema + "/" + table

sql = "SELECT * FROM prod.nps WHERE DATE(created_at) = DATE('{{ execution_date }}')"
print(sql)
mysql_to_s3_nps = SqlToS3Operator(
    task_id = 'mysql_to_s3_nps',
    query = sql,
    s3_bucket = s3_bucket,
    s3_key = s3_key,
    sql_conn_id = "mysql_conn_id",
    aws_conn_id = "aws_conn_id",
    verify = False,
    replace = True,
    pd_kwargs={"index": False, "header": False},
    dag = dag
)

s3_to_redshift_nps = S3ToRedshiftOperator(
    task_id = 's3_to_redshift_nps',
    s3_bucket = s3_bucket,
    s3_key = s3_key,
    schema = schema,
    table = table,
    copy_options=['csv'],
    redshift_conn_id = "redshift_dev_db",
    aws_conn_id = "aws_conn_id",
    method = "UPSERT",
    upsert_keys = ["id"],
    dag = dag
)

mysql_to_s3_nps >> s3_to_redshift_nps
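
Note: two things change in v2. The query is filtered to the run's logical date through the Jinja template {{ execution_date }}, so each run extracts only that day's rows, and method = "UPSERT" with upsert_keys = ["id"] merges those rows into the target table instead of replacing it. The same filter can also be written with the ds macro (a sketch; ds renders as the logical date in YYYY-MM-DD form):

sql = "SELECT * FROM prod.nps WHERE DATE(created_at) = '{{ ds }}'"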
@@ -0,0 +1,82 @@
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime
import requests
import logging
import psycopg2


def get_Redshift_connection():
    host = "learnde.cduaw970ssvt.ap-northeast-2.redshift.amazonaws.com"
    user = "keeyong"  # use your own ID
    password = "..."  # use your own password
    port = 5439
    dbname = "dev"
    conn = psycopg2.connect(f"dbname={dbname} user={user} host={host} password={password} port={port}")
    conn.set_session(autocommit=True)
    return conn.cursor()


def extract(url):
    logging.info("Extract started")
    f = requests.get(url)
    logging.info("Extract done")
    return f.text


def transform(text):
    logging.info("Transform started")
    lines = text.strip().split("\n")[1:]  # skip the header (first) line
    records = []
    for l in lines:
        (name, gender) = l.split(",")  # l = "Keeyong,M" -> [ 'Keeyong', 'M' ]
        records.append([name, gender])
    logging.info("Transform ended")
    return records


def load(records):
    logging.info("load started")
    """
    records = [
        [ "Keeyong", "M" ],
        [ "Claire", "F" ],
        ...
    ]
    """
    schema = "keeyong"
    # wrap the statements in a transaction with BEGIN/END so the load is all-or-nothing
    cur = get_Redshift_connection()
    try:
        cur.execute("BEGIN;")
        cur.execute(f"DELETE FROM {schema}.name_gender;")
        # DELETE FROM runs first -> effectively a full refresh
        for r in records:
            name = r[0]
            gender = r[1]
            print(name, "-", gender)
            sql = f"INSERT INTO {schema}.name_gender VALUES ('{name}', '{gender}')"
            cur.execute(sql)
        cur.execute("COMMIT;")  # cur.execute("END;")
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        cur.execute("ROLLBACK;")
    logging.info("load done")


def etl():
    link = "https://s3-geospatial.s3-us-west-2.amazonaws.com/name_gender.csv"
    data = extract(link)
    lines = transform(data)
    load(lines)


dag_second_assignment = DAG(
    dag_id = 'name_gender',
    catchup = False,
    start_date = datetime(2023,4,6),  # the DAG will not run if this date is in the future
    schedule = '0 2 * * *')  # adjust as needed

task = PythonOperator(
    task_id = 'perform_etl',
    python_callable = etl,
    dag = dag_second_assignment)
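
Note: the f-string INSERT in load() breaks on values containing single quotes and is open to SQL injection; a safer sketch of the same loop using psycopg2 parameter binding:

insert_sql = f"INSERT INTO {schema}.name_gender VALUES (%s, %s)"
for name, gender in records:
    cur.execute(insert_sql, (name, gender))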
@@ -0,0 +1,102 @@
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.models import Variable

from datetime import datetime
from datetime import timedelta
import requests
import logging
import psycopg2


def get_Redshift_connection():
    host = "learnde.cduaw970ssvt.ap-northeast-2.redshift.amazonaws.com"
    redshift_user = "keeyong"  # use your own ID
    redshift_pass = "..."  # use your own password
    port = 5439
    dbname = "dev"
    conn = psycopg2.connect(f"dbname={dbname} user={redshift_user} host={host} password={redshift_pass} port={port}")
    conn.set_session(autocommit=True)
    return conn.cursor()


def extract(url):
    logging.info("Extract started")
    f = requests.get(url)
    logging.info("Extract done")
    return f.text


def transform(text):
    logging.info("Transform started")
    lines = text.strip().split("\n")[1:]  # skip the header (first) line
    records = []
    for l in lines:
        (name, gender) = l.split(",")  # l = "Keeyong,M" -> [ 'Keeyong', 'M' ]
        records.append([name, gender])
    logging.info("Transform ended")
    return records


def load(records):
    logging.info("load started")
    """
    records = [
        [ "Keeyong", "M" ],
        [ "Claire", "F" ],
        ...
    ]
    """
    schema = "keeyong"
    # wrap the statements in a transaction with BEGIN/END so the load is all-or-nothing
    cur = get_Redshift_connection()
    try:
        cur.execute("BEGIN;")
        cur.execute(f"DELETE FROM {schema}.name_gender;")
        # DELETE FROM runs first -> effectively a full refresh
        for r in records:
            name = r[0]
            gender = r[1]
            print(name, "-", gender)
            sql = f"INSERT INTO {schema}.name_gender VALUES ('{name}', '{gender}')"
            cur.execute(sql)
        cur.execute("COMMIT;")  # cur.execute("END;")
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        cur.execute("ROLLBACK;")
    logging.info("load done")


def etl(**context):
    link = context["params"]["url"]
    # information about the task itself (some of it DAG-level) can be read via
    # context['task_instance'] or context['ti']
    # https://airflow.readthedocs.io/en/latest/_api/airflow/models/taskinstance/index.html#airflow.models.TaskInstance
    task_instance = context['task_instance']
    execution_date = context['execution_date']

    logging.info(execution_date)

    data = extract(link)
    lines = transform(data)
    load(lines)


dag = DAG(
    dag_id = 'name_gender_v2',
    start_date = datetime(2023,4,6),  # the DAG will not run if this date is in the future
    schedule = '0 2 * * *',  # adjust as needed
    catchup = False,
    max_active_runs = 1,
    default_args = {
        'retries': 1,
        'retry_delay': timedelta(minutes=3),
    }
)


task = PythonOperator(
    task_id = 'perform_etl',
    python_callable = etl,
    params = {
        'url': "https://s3-geospatial.s3-us-west-2.amazonaws.com/name_gender.csv"
    },
    dag = dag)
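
Note: params values surface inside the callable as context["params"], as etl() shows above. A hypothetical alternative is op_kwargs, which hands values to the callable as named arguments instead:

task = PythonOperator(
    task_id = 'perform_etl',
    python_callable = etl,
    op_kwargs = {'url': "https://s3-geospatial.s3-us-west-2.amazonaws.com/name_gender.csv"},
    dag = dag)

with the callable then defined as def etl(url, **context).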