Skip to content

Commit

Permalink
Add mart_gtfs.fct_vehicle_locations_grouped (#3660)
Browse files Browse the repository at this point in the history
* get a vp grouping column

* get counts per location

* use st_equals for checking if location is the same

* rewrite so location passes groupby

* remove vp_group and add more rt identifiers

* add table to yaml

* rename ctes

* azimuth to direction

* get cardinal direction

* left join for direction

* move joins ahead so groupings are correct

* add some comments

* fill out docs, add tests

* rename table

* make incremental

* switch order/where

* remove dt, it's clustered by service_date
  • Loading branch information
tiffanychu90 authored Jan 28, 2025
1 parent 73aa28c commit eb42d3d
Show file tree
Hide file tree
Showing 2 changed files with 178 additions and 2 deletions.
47 changes: 45 additions & 2 deletions warehouse/models/mart/gtfs/_mart_gtfs_fcts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -705,15 +705,17 @@ models:
- &rt_position_speed
name: position_speed
description: '{{ doc("gtfs_position__speed") }}'
- name: location_timestamp
- &rt_location_timestamp
name: location_timestamp
description: Vehicle timestamp or header timestamp
- name: vehicle_trip_key
description: |
Composite of service_date, URL, vehicle_id, vehicle_label,
trip_id, and trip_start_time.
- name: next_location_key
description: Location key for the next vehicle timestamp.
- name: location
- &rt_location
name: location
description: GEOGPOINT created by the position latitude and longitude
- *trip_instance_key

Expand Down Expand Up @@ -2511,3 +2513,44 @@ models:
description: |
Total scheduled service hours that occurred for the route for this
month, `day_type`, and `time_of_day`.
- name: fct_vehicle_locations_grouped
description: |
Vehicle positions, grouped by location position. Uses fct_vehicle_locations and
calculates a first timestamp at location (location_timestamp) and
last timestamp at location (moving_timestamp).
Unique at the url/vehicle/trip/location_position level.
columns:
- name: key
description: |
Synthetic primary key constructed from `service_date`, `base64_url`,
`location_timestamp`, `vehicle_id`, `vehicle_label`,
`trip_id`, and `trip_start_time`.
tests: *almost_unique_rt_key_tests
- *gtfs_rt_dt
- *rt_service_date
- name: gtfs_dataset_key
description: *gtfs_dataset_key_desc
tests:
- dbt_utils.relationships_where:
to: ref('dim_gtfs_datasets')
field: key
to_condition: "type = 'vehicle_positions'"
- *base64_url
- *gtfs_rt_name
- *gtfs_rt_schedule_dataset_key
- <<: *trip_instance_key
tests:
- not_null
- unique_proportion:
at_least: 0.9999
- relationships:
to: ref('fct_observed_trips')
field: trip_instance_key
- *rt_location_timestamp
- name: moving_timestamp
description: |
The last location_timestamp at this position.
- name: n_vp
description: |
Number of vehicle positions observed at this location.
- *rt_location
133 changes: 133 additions & 0 deletions warehouse/models/mart/gtfs/fct_vehicle_locations_grouped.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
{#
    Incremental model using the insert_overwrite strategy.
    Partitioned by day on service_date, clustered by service_date and base64_url;
    columns added to the model later are appended to the existing table.
#}
{{
    config(
        materialized="incremental",
        incremental_strategy="insert_overwrite",
        partition_by={
            "field": "service_date",
            "data_type": "date",
            "granularity": "day"
        },
        cluster_by=["service_date", "base64_url"],
        on_schema_change="append_new_columns"
    )
}}

-- Vehicle positions restricted by the incremental_where macro, trimmed to the
-- columns the rest of this model needs.
-- Deliberately no ORDER BY: every downstream step is a join, a window function
-- (which declares its own ordering), or an aggregate, so sorting here could not
-- change the result and would only add a sort over the full input.
WITH fct_vehicle_locations AS (
    SELECT
        key,
        gtfs_dataset_key,
        base64_url,
        gtfs_dataset_name,
        schedule_gtfs_dataset_key,
        service_date,
        trip_instance_key,
        location_timestamp,
        location,
        next_location_key
    FROM {{ ref('fct_vehicle_locations') }}
    WHERE {{ incremental_where(default_start_var='PROD_GTFS_RT_START') }}
),


-- Key and point of every vehicle position, renamed so the pair can be joined
-- onto the row that references it through next_location_key.
next_location AS (
    SELECT
        vp.key AS next_location_key,
        vp.location AS next_location
    FROM fct_vehicle_locations AS vp
),

-- Pair each vehicle position with the position it points to as "next" and
-- compare the two points.
--   delta_lon / delta_lat: next point minus current point (ST_X / ST_Y).
--   new_group: 0 when ST_EQUALS says the two points match, 1 otherwise.
-- The inner join keeps only positions whose next_location_key finds a match.
same_locations AS (
    SELECT
        cur.key,
        cur.next_location_key,
        ST_X(cur.location) AS lon,
        ST_Y(cur.location) AS lat,
        ST_X(nxt.next_location) - ST_X(cur.location) AS delta_lon,
        ST_Y(nxt.next_location) - ST_Y(cur.location) AS delta_lat,
        CASE
            WHEN ST_EQUALS(cur.location, nxt.next_location) THEN 0
            ELSE 1
        END AS new_group
    FROM fct_vehicle_locations AS cur
    INNER JOIN next_location AS nxt
        ON nxt.next_location_key = cur.next_location_key
),

-- Cardinal direction of travel for every position that starts a new location
-- group. Rows are keyed by next_location_key, i.e. the position the vehicle
-- arrived at, so the direction describes the movement into that position.
-- The dominant axis (larger absolute delta) picks East/West vs North/South.
-- NOTE(review): deltas are raw degree differences, so a degree of longitude is
-- weighted the same as a degree of latitude -- confirm that is acceptable.
direction AS (
    SELECT
        same_locations.next_location_key AS key,
        same_locations.new_group,
        CASE
            -- >= so that an exact diagonal move (|delta_lon| = |delta_lat|)
            -- resolves to East/West instead of matching no branch and being
            -- labelled "Unknown" downstream even though the vehicle moved.
            WHEN (ABS(delta_lon) >= ABS(delta_lat)) AND (delta_lon > 0)
                THEN "East"
            WHEN (ABS(delta_lon) >= ABS(delta_lat)) AND (delta_lon < 0)
                THEN "West"
            WHEN (ABS(delta_lon) < ABS(delta_lat)) AND (delta_lat > 0)
                THEN "North"
            WHEN (ABS(delta_lon) < ABS(delta_lat)) AND (delta_lat < 0)
                THEN "South"
        END AS vp_direction
    FROM same_locations
    -- subset to where new_group is identified so we can fill in unknown
    -- direction / dwelling points once we group the vp
    WHERE same_locations.new_group = 1
),

-- Every vehicle position key, with the new-group flag and direction attached
-- when the position has a row in `direction`; both columns are NULL otherwise
-- because of the left join.
keys_grouped AS (
    SELECT
        vp.key,
        dir.new_group,
        dir.vp_direction
    FROM fct_vehicle_locations AS vp
    LEFT JOIN direction AS dir
        ON dir.key = vp.key
),

-- Assign each vehicle position a vp_group id: the running total of new_group
-- flags within a (service_date, trip_instance_key), ordered by timestamp.
-- Consecutive positions that did not start a new group share the same total,
-- and therefore the same vp_group.
vp_grouper AS (
    SELECT
        vp.key,
        vp.gtfs_dataset_key,
        vp.base64_url,
        vp.gtfs_dataset_name,
        vp.schedule_gtfs_dataset_key,
        vp.service_date,
        vp.trip_instance_key,
        vp.location,
        vp.location_timestamp,
        SUM(flags.new_group) OVER (
            PARTITION BY vp.service_date, vp.trip_instance_key
            ORDER BY vp.location_timestamp
            RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS vp_group,
        flags.vp_direction
    FROM fct_vehicle_locations AS vp
    INNER JOIN keys_grouped AS flags
        ON flags.key = vp.key
),

-- Collapse each vp_group into a single row per trip:
--   location_timestamp = earliest timestamp in the group,
--   moving_timestamp   = latest timestamp in the group,
--   n_vp               = number of vehicle positions in the group.
fct_grouped_locations AS (
    SELECT
        MIN(grp.key) AS key,
        grp.gtfs_dataset_key,
        grp.base64_url,
        grp.gtfs_dataset_name,
        grp.schedule_gtfs_dataset_key,
        grp.service_date,
        grp.trip_instance_key,
        MIN(grp.location_timestamp) AS location_timestamp,
        MAX(grp.location_timestamp) AS moving_timestamp,
        COUNT(*) AS n_vp,
        -- the point is converted to text, aggregated, and converted back;
        -- presumably so the geography column can be aggregated -- TODO confirm
        ST_GEOGFROMTEXT(MIN(ST_ASTEXT(grp.location))) AS location,
        -- now that we grabbed a valid direction, any remaining should be unknown
        COALESCE(MIN(grp.vp_direction), "Unknown") AS vp_direction
    FROM vp_grouper AS grp
    GROUP BY
        grp.gtfs_dataset_key,
        grp.base64_url,
        grp.gtfs_dataset_name,
        grp.schedule_gtfs_dataset_key,
        grp.service_date,
        grp.trip_instance_key,
        grp.vp_group
)

SELECT * FROM fct_grouped_locations

0 comments on commit eb42d3d

Please sign in to comment.