Skip to content

Commit

Permalink
Merge pull request #181 from Data4Democracy/crash_import
Browse files Browse the repository at this point in the history
Crash import
  • Loading branch information
j-t-t authored Sep 4, 2018
2 parents 373d542 + c5374e9 commit 03340d5
Show file tree
Hide file tree
Showing 13 changed files with 372 additions and 48 deletions.
16 changes: 15 additions & 1 deletion src/config/config_boston.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,23 @@ crashes_files:
id: ID
latitude: lat
longitude: long
date: dispatch_ts
# If date supplied in single column:
date_complete: dispatch_ts
# If date is separated into year/month/day:
date_year:
date_month:
# Leave date_day empty if not available
date_day:
# If time is available and separate from date:
time:
# If time specified, time_format is one of:
# default (HH:MM:SS)
# seconds (since midnight)
# military (HHMM)
time_format:
optional:
summary: location_type
address:
vehicles: mode_type
bikes: mode_type

Expand Down
51 changes: 51 additions & 0 deletions src/config/config_brisbane.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# City name
city: brisbane
# City centerpoint latitude & longitude (default geocoded values set)
city_latitude: -27.4697707
city_longitude: 153.0251235
# Radius of city's road network from centerpoint in km, required if OSM has no polygon data (defaults to 20km)
city_radius: 20
# The folder under data where this city's data is stored
name: brisbane
# If given, limit crashes to after start_year and before end_year
# Recommended to limit to just a few years for now
start_year:
end_year:


#################################################################
# Configuration for data standardization

# crash file configurations
crashes_files:
locations_2014_2017.csv:
required:
id: Crash_Ref_Number
latitude: Crash_Latitude_GDA94
longitude: Crash_Longitude_GDA94
# If date supplied in single column:
date_complete: Crash_Date
# If date is separated into year/month/day:
date_year:
date_month:
# Leave date_day empty if not available
date_day:
# If time is available and separate from date:
time:
# If time specified, time_format is one of:
# default (HH:MM:SS)
# seconds (since midnight)
# military (HHMM)
time_format:
optional:
summary: Crash_DCA_Description
address: Crash_Street
vehicles:
bikes:

# week on which to predict crashes (week, year)
# Best practice is to choose a week towards the end of your crash data set
# in format [week, year]
time_target: [30, 2017]
# specify how many weeks back to predict in output of train_model
weeks_back: 1
19 changes: 16 additions & 3 deletions src/config/config_cambridge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,25 @@ crashes_files:
id: ID
latitude: Y
longitude: X
date: Date Time
# If date supplied in single column:
date_complete: Date Time
# If date is separated into year/month/day:
date_year:
date_month:
# Leave date_day empty if not available
date_day:
# If time is available and separate from date:
time:
# If time specified, time_format is one of:
# default (HH:MM:SS)
# seconds (since midnight)
# military (HHMM)
time_format:
optional:
summary: V1 First Event
vehicles:
bikes:
address: Location
vehicles:
bikes:


# concern column name
Expand Down
15 changes: 14 additions & 1 deletion src/config/config_dc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,20 @@ crashes_files:
id: OBJECTID
latitude: Y
longitude: X
date: REPORTDATE
# If date supplied in single column:
date_complete: REPORTDATE
# If date is separated into year/month/day:
date_year:
date_month:
# Leave date_day empty if not available
date_day:
# If time is available and separate from date:
time:
# If time specified, time_format is one of:
# default (HH:MM:SS)
# seconds (since midnight)
# military (HHMM)
time_format:
optional:
summary: MAR_ADDRESS
vehicles: TOTAL_VEHICLES
Expand Down
49 changes: 49 additions & 0 deletions src/config/config_pittsburgh.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# City name
city: Pittsburgh, PA, USA
# City centerpoint latitude & longitude (default geocoded values set)
city_latitude: 40.44062479999999
city_longitude: -79.9958864
# Radius of city's road network from centerpoint in km, required if OSM has no polygon data (defaults to 20km)
city_radius: 20
# The folder under data where this city's data is stored
name: pittsburgh
# If given, limit crashes to after start_year and before end_year
# Recommended to limit to just a few years for now
start_year:
end_year:


#################################################################
# Configuration for data standardization

# crash file configurations
crashes_files:
pittsburgh_2017.csv:
required:
id: _id
latitude: DEC_LAT
longitude: DEC_LONG
# If date supplied in single column:
date_complete:
# If date is separated into year/month/day:
date_year: CRASH_YEAR
date_month: CRASH_MONTH
# Leave date_day empty if not available
date_day:
# If time is available and separate from date:
time: TIME_OF_DAY
# If time specified, time_format is one of:
# default (HH:MM:SS)
# seconds (since midnight)
# military (HHMM)
time_format: military
optional:
summary:
address:

# week on which to predict crashes (week, year)
# Best practice is to choose a week towards the end of your crash data set
# in format [week, year]
time_target: [30, 2017]
# specify how many weeks back to predict in output of train_model
weeks_back: 1
29 changes: 18 additions & 11 deletions src/data/osm_create_maps.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,22 @@ def get_width(width):
return width


def get_speed(speed):
    """
    Parse the speed from the openstreetmap maxspeed property field
    If there's more than one speed (from merged ways), use the highest speed
    Args:
        speed - a string; may be empty or None
    Returns:
        speed - an int, the largest number found in the string,
            or 0 if the value is empty or contains no digits
    """
    # Nothing to parse for empty/missing values
    if not speed:
        return 0

    # Raw string so that \d reaches the regex engine instead of being
    # treated as an (invalid) string escape sequence
    speeds = [int(x) for x in re.findall(r'\d+', speed)]
    if speeds:
        return max(speeds)
    return 0


def clean_ways(orig_file, DOC_FP):
"""
Reads in osm_ways file, cleans up the features, and reprojects
Expand All @@ -371,17 +387,8 @@ def clean_ways(orig_file, DOC_FP):
results = []
for way_line in way_lines:

# All features need to be ints, so convert them here

# Use speed limit if given in osm
speed = way_line['properties']['maxspeed']
if speed:
s = re.search('[0-9]+', speed)
if s:
speed = s.group(0)
if not speed:
speed = 0

speed = get_speed(way_line['properties']['maxspeed']) \
if 'maxspeed' in list(way_line['properties']) else 0
width = get_width(way_line['properties']['width']) \
if 'width' in list(way_line['properties']) else 0

Expand Down
17 changes: 15 additions & 2 deletions src/data/tests/test_initialize_city.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,25 @@ def mockreturn(address):
id:
latitude:
longitude:
date:
# Time is only required if date and time are in different columns
# If date supplied in single column:
date_complete:
# If date is separated into year/month/day:
date_year:
date_month:
# Leave date_day empty if not available
date_day:
# If time is available and separate from date:
time:
# If time specified, time_format is one of:
# default (HH:MM:SS)
# seconds (since midnight)
# military (HHMM)
time_format:
optional:
summary:
address:
vehicles:
bikes:
# List of concern type information
concern_files:
Expand Down
7 changes: 7 additions & 0 deletions src/data/tests/test_osm_create_maps.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@ def test_get_width():
assert osm_create_maps.get_width('t') == 0


def test_get_speed():
    # Each pair is (raw maxspeed value, expected parsed speed)
    cases = [
        ('', 0),
        ('signals', 0),
        ('60', 60),
        ("['90', '100']", 100),
    ]
    for raw_value, expected in cases:
        assert osm_create_maps.get_speed(raw_value) == expected


def test_reproject_and_clean_feats(tmpdir):

tmppath = tmpdir.strpath
Expand Down
37 changes: 25 additions & 12 deletions src/data_standardization/standardization_util.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import dateutil.parser as date_parser
import re
from datetime import timedelta
from datetime import datetime, timedelta
import json
from jsonschema import validate


def parse_date(date, time=None):
def parse_date(date, time=None, time_format=None):
"""
Turn a date (and optional time) into a datetime string
in standardized format
Expand All @@ -21,22 +20,36 @@ def parse_date(date, time=None):

# If there's no time in the date given, look at the time field
# if available
if date.hour == 0 and date.minute == 0 and date.second == 0 \
and time:

# special case of seconds past midnight
if re.match(r"^\d+$", str(time)) and int(time) >= 0 \
and int(time) < 86400:
if date.hour == 0 and date.minute == 0 and date.second == 0 and time:

if time_format == "military":
# military times less than 4 chars require padding with leading zeros
# e.g. 155 becomes 0155
while (len(str(time)) < 4):
time = "0" + str(time)

# ignore invalid times
if int(time) <= 2359:
date = date_parser.parse(
date.strftime('%Y-%m-%d ') + datetime.strptime(str(time), '%H%M').strftime('%I:%M%p').lower()
)

else:
date = date_parser.parse(
date.strftime('%Y-%m-%d ')
)

elif time_format == "seconds":
date = date + timedelta(seconds=int(time))

else:
date = date_parser.parse(
date.strftime('%Y-%m-%d ') + str(time)
)

# TODO add timezone to config ("Z" is UTC)
date_time = date.strftime("%Y-%m-%dT%H:%M:%SZ")

return date_time


Expand Down
Loading

0 comments on commit 03340d5

Please sign in to comment.