-
Notifications
You must be signed in to change notification settings - Fork 57
/
set_based.yaml
126 lines (119 loc) · 2.67 KB
/
set_based.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
entities:
TEST_DATA:
source_database: BIGQUERY
table_name: <table_id>
dataset_name: <dataset_id>
project_name: <project_id>
columns:
UNIQUE_KEY:
name: unique_key
data_type: INTEGER
DESCRIPT:
name: descript
data_type: STRING
DATE:
name: date
data_type: DATE
TIME:
name: time
data_type: TIME
ADDRESS:
name: address
data_type: STRING
LON:
name: longitude
data_type: FLOAT
LAT:
name: latitude
data_type: FLOAT
LOCATION:
name: location
data_type: STRING
TIMESTAMP:
name: timestamp
data_type: TIMESTAMP
TAXI_TRIPS:
source_database: BIGQUERY
table_name: <table_id>
dataset_name: <dataset_id>
project_name: <project_id>
columns:
UNIQUE_KEY:
name: unique_key
data_type: STRING
TAXI_ID:
name: taxi_id
data_type: STRING
TRIP_START_TIMESTAMP:
name: trip_start_timestamp
data_type: TIMESTAMP
TRIP_END_TIMESTAMP:
name: trip_end_timestamp
data_type: TIMESTAMP
PAYMENT_TYPE:
name: payment_type
data_type: STRING
FARE:
name: fare
data_type: FLOAT
TIPS:
name: tips
data_type: FLOAT
TOLLS:
name: tolls
data_type: FLOAT
EXTRAS:
name: extras
data_type: FLOAT
TRIP_TOTAL:
name: trip_total
data_type: FLOAT
PICKUP_LOCATION:
name: pickup_location
data_type: STRING
DROPOFF_LOCATION:
name: dropoff_location
data_type: STRING
rules:
DISTINCT_VALUES:
rule_type: CUSTOM_SQL_STATEMENT
custom_sql_arguments:
n_max
params:
custom_sql_statement: |-
select count(distinct $column) as n
from data
having n > $n_max
Z_SCORE_OUTLIER:
rule_type: CUSTOM_SQL_STATEMENT
custom_sql_arguments:
z_limit
params:
custom_sql_statement: |-
with stats as (
select avg($column) as mu, stddev($column) as sigma
from data
),
z_scores as (
select $column, mu, sigma, abs(mu - $column)/sigma as z
from data
join stats on true
)
select *
from z_scores
where z > $z_limit
rule_bindings:
DISTINCT_VALUES:
entity_id: TEST_DATA
column_id: UNIQUE_KEY
row_filter_id: NONE
rule_ids:
- DISTINCT_VALUES:
n_max: 1000
OUTLIER_DETECTION:
entity_id: TAXI_TRIPS
column_id: TIPS
row_filter_id: NONE
rule_ids:
- Z_SCORE_OUTLIER:
z_limit: 3