Skip to content

Commit 04fcb6d

Browse files
authored
Merge pull request #127 from zizzic/feature/glue_crawler
[feat]: add crawler-operator to dag
2 parents cf2e68d + 6964eac commit 04fcb6d

File tree

2 files changed

+28
-22
lines changed

2 files changed

+28
-22
lines changed

dags/glue/game_ccu.py

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
from datetime import datetime, timedelta
22

33
from airflow import DAG
4+
from airflow.models import Variable
45

56
# from airflow.operators.python import PythonOperator
67
from airflow.providers.amazon.aws.operators.glue import GlueJobOperator
8+
from airflow.providers.amazon.aws.operators.glue_crawler import GlueCrawlerOperator
79
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
810

911
from airflow.providers.amazon.aws.sensors.glue import GlueJobSensor
@@ -54,20 +56,6 @@ def upload_rendered_script_to_s3(
5456
day = "{{ (data_interval_end - macros.timedelta(hours=1)).in_timezone('Asia/Seoul').day }}"
5557
hour = "{{ (data_interval_end - macros.timedelta(hours=1)).in_timezone('Asia/Seoul').hour }}" # before 1 hour
5658

57-
# upload_script = PythonOperator(
58-
# task_id="upload_script_to_s3",
59-
# python_callable=upload_rendered_script_to_s3,
60-
# op_kwargs={
61-
# "bucket_name": bucket_name,
62-
# "aws_conn_id": "aws_conn_id",
63-
# "template_s3_key": "source/script/glue_game_ccu_template.py",
64-
# "rendered_s3_key": "source/script/glue_game_ccu_script.py",
65-
# # into template
66-
# "input_path": f"s3://de-2-1-bucket/source/json/table_name=raw_game_ccu/year={year}/month={month}/day={day}/hour={hour}/",
67-
# "output_path": f"s3://de-2-1-bucket/source/parquet/table_name=raw_game_ccu/year={year}/month={month}/day={day}/hour={hour}/",
68-
# },
69-
# )
70-
7159
run_glue_job = GlueJobOperator(
7260
task_id="run_glue_job",
7361
job_name="de-2-1_game_ccu",
@@ -89,6 +77,24 @@ def upload_rendered_script_to_s3(
8977
run_id=run_glue_job.output,
9078
aws_conn_id="aws_conn_id",
9179
)
80+
glue_crawler_arn = Variable.get("glue_crawler_arn_secret")
81+
glue_crawler_config = {
82+
"Name": "de-2-1-raw_game_ccu",
83+
"Role": glue_crawler_arn,
84+
"DatabaseName": "de_2_1_glue",
85+
"Targets": {
86+
"S3Targets": [
87+
{
88+
"Path": "s3://de-2-1-bucket/source/parquet/table_name=raw_live_viewer/"
89+
}
90+
]
91+
},
92+
}
9293

94+
crawl_s3 = GlueCrawlerOperator(
95+
task_id="crawl_s3",
96+
config=glue_crawler_config,
97+
aws_conn_id="aws_conn_id",
98+
)
9399
# upload_script >> run_glue_job >> wait_for_job
94-
run_glue_job >> wait_for_job
100+
run_glue_job >> wait_for_job >> crawl_s3

tests/dags/test_dag_integrity.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import glob
44
import os
55
import pytest
6+
from unittest.mock import patch
67

78
# from airflow.models import DAG
89
from airflow.models.dagbag import DagBag
@@ -21,11 +22,10 @@ def test_dag_integrity(dag_file):
2122
Args:
2223
dag_file (str): The path to a DAG file to be tested.
2324
"""
24-
dag_bag = DagBag(dag_folder=os.path.dirname(dag_file), include_examples=False)
25+
with patch("airflow.models.Variable.get", return_value="dummy_value"):
26+
dag_bag = DagBag(dag_folder=os.path.dirname(dag_file), include_examples=False)
27+
dag_bag.process_file(dag_file, only_if_updated=True)
2528

26-
dag_bag.process_file(dag_file, only_if_updated=True)
27-
28-
# dag_id, dag in dag_bag.dags.items()
29-
for dag_id, _ in dag_bag.dags.items():
30-
assert dag_id in dag_bag.dags, "DAG ID not found in dag_bag.dags"
31-
assert not dag_bag.import_errors, "Import errors found in DagBag"
29+
for dag_id, _ in dag_bag.dags.items():
30+
assert dag_id in dag_bag.dags, "DAG ID not found in dag_bag.dags"
31+
assert not dag_bag.import_errors, "Import errors found in DagBag"

0 commit comments

Comments (0)