diff --git a/src/config/config_boston.yml b/src/config/config_boston.yml
index 82ad9565..c37bf466 100644
--- a/src/config/config_boston.yml
+++ b/src/config/config_boston.yml
@@ -34,9 +34,23 @@ crashes_files:
             id: ID
             latitude: lat
             longitude: long
-            date: dispatch_ts
+            # If date supplied in single column:
+            date_complete: dispatch_ts
+            # If date is separated into year/month/day:
+            date_year:
+            date_month:
+            # Leave date_day empty if not available
+            date_day:
+            # If time is available and separate from date:
+            time:
+            # If time specified, time_format is one of:
+            # default (HH:MM:SS)
+            # seconds (since midnight)
+            # military (HHMM)
+            time_format:
         optional:
             summary: location_type
+            address:
             vehicles: mode_type
             bikes: mode_type
 
diff --git a/src/config/config_brisbane.yml b/src/config/config_brisbane.yml
new file mode 100644
index 00000000..4919ea85
--- /dev/null
+++ b/src/config/config_brisbane.yml
@@ -0,0 +1,51 @@
+# City name
+city: brisbane
+# City centerpoint latitude & longitude (default geocoded values set)
+city_latitude: -27.4697707
+city_longitude: 153.0251235
+# Radius of city's road network from centerpoint in km, required if OSM has no polygon data (defaults to 20km)
+city_radius: 20
+# The folder under data where this city's data is stored
+name: brisbane
+# If given, limit crashes to after start_year and before end_year
+# Recommended to limit to just a few years for now
+start_year:
+end_year:
+
+
+#################################################################
+# Configuration for data standardization
+
+# crash file configurations
+crashes_files:
+    locations_2014_2017.csv:
+        required:
+            id: Crash_Ref_Number
+            latitude: Crash_Latitude_GDA94
+            longitude: Crash_Longitude_GDA94
+            # If date supplied in single column:
+            date_complete: Crash_Date
+            # If date is separated into year/month/day:
+            date_year:
+            date_month:
+            # Leave date_day empty if not available
+            date_day:
+            # If time is available and separate from date:
+            time:
+            # If time specified, time_format is one of:
+            # default (HH:MM:SS)
+            # seconds (since midnight)
+            # military (HHMM)
+            time_format:
+        optional:
+            summary: Crash_DCA_Description
+            address: Crash_Street
+            vehicles:
+            bikes:
+
+# week on which to predict crashes (week, year)
+# Best practice is to choose a week towards the end of your crash data set
+# in format [month, year]
+time_target: [30, 2017]
+# specify how many weeks back to predict in output of train_model
+weeks_back: 1
\ No newline at end of file
diff --git a/src/config/config_cambridge.yml b/src/config/config_cambridge.yml
index 1326849e..20346e76 100644
--- a/src/config/config_cambridge.yml
+++ b/src/config/config_cambridge.yml
@@ -29,12 +29,25 @@ crashes_files:
             id: ID
             latitude: Y
             longitude: X
-            date: Date Time
+            # If date supplied in single column:
+            date_complete: Date Time
+            # If date is separated into year/month/day:
+            date_year:
+            date_month:
+            # Leave date_day empty if not available
+            date_day:
+            # If time is available and separate from date:
+            time:
+            # If time specified, time_format is one of:
+            # default (HH:MM:SS)
+            # seconds (since midnight)
+            # military (HHMM)
+            time_format:
         optional:
             summary: V1 First Event
-            vehicles:
-            bikes:
             address: Location
+            vehicles:
+            bikes:
 
 
 # concern column name
diff --git a/src/config/config_dc.yml b/src/config/config_dc.yml
index a97fbdf2..664a4793 100644
--- a/src/config/config_dc.yml
+++ b/src/config/config_dc.yml
@@ -23,7 +23,20 @@ crashes_files:
             id: OBJECTID
             latitude: Y
             longitude: X
-            date: REPORTDATE
+            # If date supplied in single column:
+            date_complete: REPORTDATE
+            # If date is separated into year/month/day:
+            date_year:
+            date_month:
+            # Leave date_day empty if not available
+            date_day:
+            # If time is available and separate from date:
+            time:
+            # If time specified, time_format is one of:
+            # default (HH:MM:SS)
+            # seconds (since midnight)
+            # military (HHMM)
+            time_format:
         optional:
             summary: MAR_ADDRESS
             vehicles: TOTAL_VEHICLES
diff --git a/src/config/config_pittsburgh.yml b/src/config/config_pittsburgh.yml
new file mode 100644
index 00000000..63996756
--- /dev/null
+++ b/src/config/config_pittsburgh.yml
@@ -0,0 +1,49 @@
+# City name
+city: Pittsburgh, PA, USA
+# City centerpoint latitude & longitude (default geocoded values set)
+city_latitude: 40.44062479999999
+city_longitude: -79.9958864
+# Radius of city's road network from centerpoint in km, required if OSM has no polygon data (defaults to 20km)
+city_radius: 20
+# The folder under data where this city's data is stored
+name: pittsburgh
+# If given, limit crashes to after start_year and before end_year
+# Recommended to limit to just a few years for now
+start_year:
+end_year:
+
+
+#################################################################
+# Configuration for data standardization
+
+# crash file configurations
+crashes_files:
+    pittsburgh_2017.csv:
+        required:
+            id: _id
+            latitude: DEC_LAT
+            longitude: DEC_LONG
+            # If date supplied in single column:
+            date_complete:
+            # If date is separated into year/month/day:
+            date_year: CRASH_YEAR
+            date_month: CRASH_MONTH
+            # Leave date_day empty if not available
+            date_day:
+            # If time is available and separate from date:
+            time: TIME_OF_DAY
+            # If time specified, time_format is one of:
+            # default (HH:MM:SS)
+            # seconds (since midnight)
+            # military (HHMM)
+            time_format: military
+        optional:
+            summary:
+            address:
+
+# week on which to predict crashes (week, year)
+# Best practice is to choose a week towards the end of your crash data set
+# in format [month, year]
+time_target: [30, 2017]
+# specify how many weeks back to predict in output of train_model
+weeks_back: 1
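
Note on the new crash-file fields above: each city now fills in either date_complete or the date_year/date_month/date_day trio, and sets time/time_format only when the time of day sits in its own column. A rough Python sketch of how a Pittsburgh-style record is interpreted under these settings (the fields and crash dicts below are illustrative only; the actual logic lives in read_standardized_fields and parse_date further down in this diff):

# Hedged sketch of the field-selection decision, not the project code.
fields = {
    "date_complete": "",        # empty, so fall back to the split date columns
    "date_year": "CRASH_YEAR",
    "date_month": "CRASH_MONTH",
    "date_day": "",             # no day column in the source data
    "time": "TIME_OF_DAY",
    "time_format": "military",  # TIME_OF_DAY is stored as HHMM, e.g. 1430
}
crash = {"CRASH_YEAR": 2017, "CRASH_MONTH": 1, "TIME_OF_DAY": "1430"}

if fields["date_complete"]:
    date_column = fields["date_complete"]   # single timestamp column
else:
    date_columns = (fields["date_year"], fields["date_month"], fields["date_day"])
    # an empty date_day means read_standardized_fields randomizes a day in that month
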
diff --git a/src/data/osm_create_maps.py b/src/data/osm_create_maps.py
index 31bf2e8a..d4da0ca4 100644
--- a/src/data/osm_create_maps.py
+++ b/src/data/osm_create_maps.py
@@ -345,6 +345,22 @@ def get_width(width):
     return width
 
 
+def get_speed(speed):
+    """
+    Parse the speed from the openstreetmap maxspeed property field
+    If there's more than one speed (from merged ways), use the highest speed
+    Args:
+        speed - a string
+    Returns:
+        speed - an int
+    """
+    if speed:
+        speeds = [int(x) for x in re.findall('\d+', speed)]
+        if speeds:
+            return max(speeds)
+    return 0
+
+
 def clean_ways(orig_file, DOC_FP):
     """
     Reads in osm_ways file, cleans up the features, and reprojects
@@ -371,17 +387,8 @@ def clean_ways(orig_file, DOC_FP):
 
     results = []
     for way_line in way_lines:
-        # All features need to be ints, so convert them here
-
-        # Use speed limit if given in osm
-        speed = way_line['properties']['maxspeed']
-        if speed:
-            s = re.search('[0-9]+', speed)
-            if s:
-                speed = s.group(0)
-        if not speed:
-            speed = 0
-
+        speed = get_speed(way_line['properties']['maxspeed']) \
+            if 'maxspeed' in list(way_line['properties']) else 0
         width = get_width(way_line['properties']['width']) \
             if 'width' in list(way_line['properties']) else 0
 
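
A quick sanity check of the new get_speed helper against typical OSM maxspeed values, mirroring the test added below (the import path is an assumption; adjust to however osm_create_maps is imported in this repo):

from data.osm_create_maps import get_speed  # assumed import path

print(get_speed('25 mph'))         # 25  - single posted limit
print(get_speed("['90', '100']"))  # 100 - merged ways keep the highest value
print(get_speed('signals'))        # 0   - non-numeric values fall back to 0
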
diff --git a/src/data/tests/test_initialize_city.py b/src/data/tests/test_initialize_city.py
index b5cbd84f..66a66c54 100644
--- a/src/data/tests/test_initialize_city.py
+++ b/src/data/tests/test_initialize_city.py
@@ -45,12 +45,25 @@ def mockreturn(address):
             id:
             latitude:
             longitude:
-            date:
-            # Time is only required if date and time are in different columns
+            # If date supplied in single column:
+            date_complete:
+            # If date is separated into year/month/day:
+            date_year:
+            date_month:
+            # Leave date_day empty if not available
+            date_day:
+            # If time is available and separate from date:
             time:
+            # If time specified, time_format is one of:
+            # default (HH:MM:SS)
+            # seconds (since midnight)
+            # military (HHMM)
+            time_format:
         optional:
             summary:
             address:
+            vehicles:
+            bikes:
 
 # List of concern type information
 concern_files:
diff --git a/src/data/tests/test_osm_create_maps.py b/src/data/tests/test_osm_create_maps.py
index 3767d2d0..20052b62 100644
--- a/src/data/tests/test_osm_create_maps.py
+++ b/src/data/tests/test_osm_create_maps.py
@@ -14,6 +14,13 @@ def test_get_width():
     assert osm_create_maps.get_width('t') == 0
 
 
+def test_get_speed():
+    assert osm_create_maps.get_speed('') == 0
+    assert osm_create_maps.get_speed('signals') == 0
+    assert osm_create_maps.get_speed('60') == 60
+    assert osm_create_maps.get_speed("['90', '100']") == 100
+
+
 def test_reproject_and_clean_feats(tmpdir):
     tmppath = tmpdir.strpath
 
diff --git a/src/data_standardization/standardization_util.py b/src/data_standardization/standardization_util.py
index c98ff88f..7f3ffa7a 100644
--- a/src/data_standardization/standardization_util.py
+++ b/src/data_standardization/standardization_util.py
@@ -1,11 +1,10 @@
 import dateutil.parser as date_parser
-import re
-from datetime import timedelta
+from datetime import datetime, timedelta
 import json
 from jsonschema import validate
 
 
-def parse_date(date, time=None):
+def parse_date(date, time=None, time_format=None):
     """
     Turn a date (and optional time) into a datetime string
     in standardized format
@@ -21,22 +20,36 @@
     # If there's no time in the date given, look at the time field
     # if available
-    if date.hour == 0 and date.minute == 0 and date.second == 0 \
-            and time:
-
-        # special case of seconds past midnight
-        if re.match(r"^\d+$", str(time)) and int(time) >= 0 \
-                and int(time) < 86400:
+    if date.hour == 0 and date.minute == 0 and date.second == 0 and time:
+
+        if time_format == "military":
+            # military times less than 4 chars require padding with leading zeros
+            # e.g 155 becomes 0155
+            while (len(str(time)) < 4):
+                time = "0" + str(time)
+
+            # ignore invalid times
+            if int(time) <= 2359:
+                date = date_parser.parse(
+                    date.strftime('%Y-%m-%d ') + datetime.strptime(str(time), '%H%M').strftime('%I:%M%p').lower()
+                )
+
+            else:
+                date = date_parser.parse(
+                    date.strftime('%Y-%m-%d ')
+                )
+
+        elif time_format == "seconds":
             date = date + timedelta(seconds=int(time))
-
+
         else:
             date = date_parser.parse(
                 date.strftime('%Y-%m-%d ') + str(time)
             )
-
+
     # TODO add timezone to config ("Z" is UTC)
     date_time = date.strftime("%Y-%m-%dT%H:%M:%SZ")
-
+
     return date_time
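
Usage sketch for the extended parse_date signature, using the same expected outputs as the new tests in this diff (the import path is assumed):

from data_standardization.standardization_util import parse_date  # assumed path

# military: zero-padded to four digits, then read as HHMM
parse_date('01/08/2009', time='155', time_format='military')
# -> '2009-01-08T01:55:00Z'

# seconds: the time column holds seconds past midnight
parse_date('01/08/2009', time='75180', time_format='seconds')
# -> '2009-01-08T20:53:00Z'

# invalid military times (> 2359) are ignored and the bare date is kept
parse_date('01/08/2009', time='9999', time_format='military')
# -> '2009-01-08T00:00:00Z'
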
diff --git a/src/data_standardization/standardize_crashes.py b/src/data_standardization/standardize_crashes.py
index a720dfa7..c3b099ff 100644
--- a/src/data_standardization/standardize_crashes.py
+++ b/src/data_standardization/standardize_crashes.py
@@ -7,6 +7,8 @@
 import yaml
 from collections import OrderedDict
 import csv
+import calendar
+import random
 from .standardization_util import parse_date, validate_and_write_schema
 
 CURR_FP = os.path.dirname(
@@ -21,25 +23,56 @@ def read_standardized_fields(raw_crashes, fields, opt_fields):
     for i, crash in enumerate(raw_crashes):
         if i % 10000 == 0:
             print(i)
-        # skip any crashes that don't have coordinates or date
-        if crash[fields["latitude"]] == "" or crash[fields["longitude"]] == "" \
-                or crash[fields['date']] == "":
+
+        # skip any crashes that don't have coordinates
+        if crash[fields["latitude"]] == "" or crash[fields["longitude"]] == "":
+            continue
+
+        # construct crash date based on config settings, skipping any crashes without date
+        if fields["date_complete"]:
+            if not crash[fields["date_complete"]]:
+                continue
+
+            else:
+                crash_date = crash[fields["date_complete"]]
+
+        elif fields["date_year"] and fields["date_month"]:
+            if fields["date_day"]:
+                crash_date = str(crash[fields["date_year"]]) + "-" + str(crash[fields["date_month"]]) + "-" + crash[fields["date_day"]]
+            # some cities do not supply a day of month for crashes, randomize if so
+            else:
+                available_dates = calendar.Calendar().itermonthdates(
+                    crash[fields["date_year"]], crash[fields["date_month"]])
+                crash_date = str(random.choice([date for date in available_dates if date.month == crash[fields["date_month"]]]))
+
+        # skip any crashes that don't have a date
+        else:
             continue
 
-        time = None
-        if 'time' in fields and fields['time']:
-            time = crash[fields['time']]
-        date_time = parse_date(
-            crash[fields['date']],
-            time=time
-        )
+        crash_time = None
+        if fields["time"]:
+            crash_time = crash[fields["time"]]
+
+        if fields["time_format"]:
+            crash_date_time = parse_date(
+                crash_date,
+                crash_time,
+                fields["time_format"]
+            )
+
+        else:
+            crash_date_time = parse_date(
+                crash_date,
+                crash_time
+            )
+
         # Skip crashes where date can't be parsed
-        if not date_time:
+        if not crash_date_time:
             continue
 
         formatted_crash = OrderedDict([
             ("id", crash[fields["id"]]),
-            ("dateOccurred", date_time),
+            ("dateOccurred", crash_date_time),
             ("location", OrderedDict([
                 ("latitude", float(crash[fields["latitude"]])),
                 ("longitude", float(crash[fields["longitude"]]))
@@ -169,4 +202,3 @@ def add_id(csv_file, id_field):
     list_city_crashes = list(dict_city_crashes.values())
     crashes_output = os.path.join(args.folder, "standardized/crashes.json")
     validate_and_write_schema(schema_path, list_city_crashes, crashes_output)
-
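
The year/month-only branch in read_standardized_fields above leans on calendar.Calendar().itermonthdates to pick a plausible day; a minimal standalone illustration of that fallback (not the project code):

import calendar
import random

year, month = 2017, 1
available_dates = calendar.Calendar().itermonthdates(year, month)
# itermonthdates pads out full weeks with days from neighbouring months,
# so filter on the month before choosing
crash_date = str(random.choice([d for d in available_dates if d.month == month]))
print(crash_date)  # e.g. '2017-01-23'
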
diff --git a/src/data_standardization/tests/test_standardization_util.py b/src/data_standardization/tests/test_standardization_util.py
index e5fdfd1a..34b1e87b 100644
--- a/src/data_standardization/tests/test_standardization_util.py
+++ b/src/data_standardization/tests/test_standardization_util.py
@@ -12,12 +12,21 @@ def test_parse_date():
     assert standardization_util.parse_date('01/08/2009', time='08:53:00 PM') \
         == '2009-01-08T20:53:00Z'
 
-    assert standardization_util.parse_date('01/08/2009', time='75180') \
+    assert standardization_util.parse_date('01/08/2009', time='75180', time_format='seconds') \
         == '2009-01-08T20:53:00Z'
 
     assert standardization_util.parse_date('01/08/2009 unk') \
         is None
 
+    assert standardization_util.parse_date('01/08/2009', time='0201', time_format='military') \
+        == '2009-01-08T02:01:00Z'
+
+    assert standardization_util.parse_date('01/08/2009', time='1201', time_format='military') \
+        == '2009-01-08T12:01:00Z'
+
+    assert standardization_util.parse_date('01/08/2009', time='9999', time_format='military') \
+        == '2009-01-08T00:00:00Z'
+
 
 def test_parse_address():
     address = "29 OXFORD ST\n" + \
diff --git a/src/data_standardization/tests/test_standardize_crashes.py b/src/data_standardization/tests/test_standardize_crashes.py
index a8fff8bc..6a94fc0b 100644
--- a/src/data_standardization/tests/test_standardize_crashes.py
+++ b/src/data_standardization/tests/test_standardize_crashes.py
@@ -86,3 +86,94 @@ def test_numeric_and_string_ids():
                 os.path.dirname(
                     os.path.dirname(
                         os.path.dirname(os.path.abspath(__file__))))), "standards", "crashes-schema.json"))))
+
+def test_date_formats():
+    """
+    Test various combinations of supplying dates.
+    """
+
+    fields_date_constructed = {
+        "id": "id",
+        "date_complete": "date_of_crash",
+        "time": "",
+        "time_format": "",
+        "latitude": "lat",
+        "longitude": "lng"
+    }
+
+    # Confirm crashes without coordinates are skipped
+    crashes_no_coords = [{
+        "id": "A1B2C3D4E5",
+        "date_of_crash": "2016-01-01T02:30:23-05:00",
+        "lat": "",
+        "lng": ""
+    }]
+
+    assert len(standardize_crashes.read_standardized_fields(crashes_no_coords, fields_date_constructed, {})) == 0
+
+    # Confirm crashes using date_complete but without a value are skipped
+    crashes_no_date = [{
+        "id": "A1B2C3D4E5",
+        "date_of_crash": "",
+        "lat": 42.317987926802246,
+        "lng": -71.06188127008645
+    }]
+
+    assert len(standardize_crashes.read_standardized_fields(crashes_no_date, fields_date_constructed, {})) == 0
+
+    # Confirm crashes using date_complete with a value are standardized
+    crashes_with_date = [{
+        "id": "A1B2C3D4E5",
+        "date_of_crash": "2016-01-01T02:30:23-05:00",
+        "lat": 42.317987926802246,
+        "lng": -71.06188127008645
+    }]
+
+    assert len(standardize_crashes.read_standardized_fields(crashes_with_date, fields_date_constructed, {})) == 1
+
+    # Confirm crashes using deconstructed date with all values are standardized
+    fields_date_deconstructed = {
+        "id": "id",
+        "date_complete": "",
+        "date_year": "year_of_crash",
+        "date_month": "month_of_crash",
+        "date_day": "day_of_crash",
+        "time": "",
+        "time_format": "",
+        "latitude": "lat",
+        "longitude": "lng"
+    }
+
+    crashes_with_date = [{
+        "id": "A1B2C3D4E5",
+        "year_of_crash": "2016",
+        "month_of_crash": "01",
+        "day_of_crash": "01",
+        "lat": 42.317987926802246,
+        "lng": -71.06188127008645
+    }]
+
+    assert len(standardize_crashes.read_standardized_fields(crashes_with_date, fields_date_deconstructed, {})) == 1
+
+    # Confirm crashes using deconstructed date but missing a day are standardized with a random day
+    fields_date_no_day = {
+        "id": "id",
+        "date_complete": "",
+        "date_year": "year_of_crash",
+        "date_month": "month_of_crash",
+        "date_day": "",
+        "time": "",
+        "time_format": "",
+        "latitude": "lat",
+        "longitude": "lng"
+    }
+
+    crashes_with_date = [{
+        "id": "A1B2C3D4E5",
+        "year_of_crash": 2017,
+        "month_of_crash": 1,
+        "lat": 42.317987926802246,
+        "lng": -71.06188127008645
+    }]
+
+    assert len(standardize_crashes.read_standardized_fields(crashes_with_date, fields_date_no_day, {})) == 1
diff --git a/src/initialize_city.py b/src/initialize_city.py
index 5df865b3..34c5092e 100644
--- a/src/initialize_city.py
+++ b/src/initialize_city.py
@@ -36,13 +36,25 @@ def make_config_file(yml_file, city, folder, crash, concern, supplemental=[]):
         "            id: \n" +
         "            latitude: \n" +
         "            longitude: \n" +
-        "            date: \n" +
-        "            # Time is only required if date and time" +
-        " are in different columns\n" +
+        "            # If date supplied in single column:\n" +
+        "            date_complete: \n" +
+        "            # If date is separated into year/month/day:\n" +
+        "            date_year: \n" +
+        "            date_month: \n" +
+        "            # Leave date_day empty if not available\n" +
+        "            date_day: \n"+
+        "            # If time is available and separate from date:\n" +
         "            time: \n" +
+        "            # If time specified, time_format is one of:\n" +
+        "            # default (HH:MM:SS)\n" +
+        "            # seconds (since midnight)\n" +
+        "            # military (HHMM)\n" +
+        "            time_format: \n"+
         "        optional:\n" +
         "            summary: \n" +
-        "            address: \n\n"
+        "            address: \n" +
+        "            vehicles: \n" +
+        "            bikes: \n\n"
     )
 
     if concern: