1
1
from datetime import datetime , timedelta
2
2
3
3
from airflow import DAG
4
+ from airflow .models import Variable
4
5
5
6
# from airflow.operators.python import PythonOperator
6
7
from airflow .providers .amazon .aws .operators .glue import GlueJobOperator
8
+ from airflow .providers .amazon .aws .operators .glue_crawler import GlueCrawlerOperator
7
9
from airflow .providers .amazon .aws .hooks .s3 import S3Hook
8
10
9
11
from airflow .providers .amazon .aws .sensors .glue import GlueJobSensor
@@ -54,20 +56,6 @@ def upload_rendered_script_to_s3(
54
56
# Wall-clock day in KST (Asia/Seoul) for the hour being processed: the
# schedule's data_interval_end shifted back one hour. This is a Jinja
# template string — Airflow renders it at task runtime, not at parse time.
day = "{{ (data_interval_end - macros.timedelta(hours=1)).in_timezone('Asia/Seoul').day }}"
55
57
# KST hour one hour before data_interval_end — the run processes the
# previous hour's data. Appears to feed the hour= partition segment of the
# S3 input/output paths (see the commented-out upload_script kwargs above).
hour = "{{ (data_interval_end - macros.timedelta(hours=1)).in_timezone('Asia/Seoul').hour }}"  # target partition lags the interval end by 1 hour
56
58
57
- # upload_script = PythonOperator(
58
- # task_id="upload_script_to_s3",
59
- # python_callable=upload_rendered_script_to_s3,
60
- # op_kwargs={
61
- # "bucket_name": bucket_name,
62
- # "aws_conn_id": "aws_conn_id",
63
- # "template_s3_key": "source/script/glue_game_ccu_template.py",
64
- # "rendered_s3_key": "source/script/glue_game_ccu_script.py",
65
- # # into template
66
- # "input_path": f"s3://de-2-1-bucket/source/json/table_name=raw_game_ccu/year={year}/month={month}/day={day}/hour={hour}/",
67
- # "output_path": f"s3://de-2-1-bucket/source/parquet/table_name=raw_game_ccu/year={year}/month={month}/day={day}/hour={hour}/",
68
- # },
69
- # )
70
-
71
59
run_glue_job = GlueJobOperator (
72
60
task_id = "run_glue_job" ,
73
61
job_name = "de-2-1_game_ccu" ,
@@ -89,6 +77,24 @@ def upload_rendered_script_to_s3(
89
77
run_id = run_glue_job .output ,
90
78
aws_conn_id = "aws_conn_id" ,
91
79
)
80
# IAM role ARN for the Glue crawler, stored as an Airflow Variable.
# NOTE(review): Variable.get() at module level runs on every scheduler
# parse of the DAG file; acceptable here, but a Jinja-templated
# "{{ var.value.glue_crawler_arn_secret }}" would defer it to runtime.
glue_crawler_arn = Variable.get("glue_crawler_arn_secret")

# Crawler over the Glue job's parquet output so the Data Catalog table
# for raw_game_ccu stays in sync with newly written partitions.
glue_crawler_config = {
    "Name": "de-2-1-raw_game_ccu",
    "Role": glue_crawler_arn,
    "DatabaseName": "de_2_1_glue",
    "Targets": {
        "S3Targets": [
            {
                # Fix: path previously pointed at table_name=raw_live_viewer
                # (copy-paste from a sibling DAG). This DAG's Glue job writes
                # parquet under table_name=raw_game_ccu (see the output_path
                # in the upload_script kwargs above), so crawl that prefix.
                "Path": "s3://de-2-1-bucket/source/parquet/table_name=raw_game_ccu/"
            }
        ]
    },
}

crawl_s3 = GlueCrawlerOperator(
    task_id="crawl_s3",
    config=glue_crawler_config,
    aws_conn_id="aws_conn_id",
)

# Run the Glue ETL job, wait for it to finish, then crawl its output.
run_glue_job >> wait_for_job >> crawl_s3
0 commit comments