Skip to content

Commit c118349

Browse files
dmsuehir
authored and blbarker committed
Add 'date format' parameter for import_csv (#237)
1 parent 521b163 commit c118349

File tree

5 files changed

+39
-7
lines changed

5 files changed

+39
-7
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
2015-01-03Z,01-02-2015 11:30 Z
2+
2015-04-12Z,04-12-2015 04:25 Z

integration-tests/tests/test_frame_import_csv.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,4 +174,29 @@ def test_import_csv_with_duplicate_coluns(tc):
174174
# Try to create a frame from csv, using a schema that has duplicate column names
175175
tc.frame.import_csv(path, schema=schema, header=True, infer_schema=False)
176176
except Exception as e:
177-
assert("schema has duplicate column names: ['numeric']" in str(e))
177+
assert("schema has duplicate column names: ['numeric']" in str(e))
178+
179+
def test_import_csv_datetime_format(tc):
180+
path = "../datasets/datetimes.csv"
181+
182+
# Load with the date format that matches column a
183+
f = tc.frame.import_csv(path, schema=[("a",dtypes.datetime),("b",str)], datetime_format="yyyy-MM-ddX")
184+
185+
expected = ["2015-01-03T00:00:00.000000Z","2015-04-12T00:00:00.000000Z"]
186+
actual_data = f.take(f.count())
187+
188+
for row, expected_str in zip(actual_data, expected):
189+
assert(isinstance(row[0], long)) # 'a' datetime column should be a long (number of ms since epoch)
190+
assert(dtypes.ms_to_datetime_str(row[0]) == expected_str)
191+
assert(isinstance(row[1], basestring)) # column 'b' should be a str
192+
193+
# Load with the date format that matches column b
194+
f = tc.frame.import_csv(path, schema=[("a",str),("b",dtypes.datetime)], datetime_format="MM-dd-yyyy kk:mm X")
195+
196+
expected = ["2015-01-02T11:30:00.000000Z","2015-04-12T04:25:00.000000Z"]
197+
actual_data = f.take(f.count())
198+
199+
for row, expected_str in zip(actual_data, expected):
200+
assert(isinstance(row[0], basestring)) # column 'a' should be a str
201+
assert(isinstance(row[1], long)) # column 'b' should be a long (number of ms since epoch)
202+
assert(dtypes.ms_to_datetime_str(row[1]) == expected_str)

python/sparktk/dtypes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def datetime_to_ms(date_time):
219219
if isinstance(date_time, datetime):
220220
ms = long(date_time.strftime("%s")) * 1000.0
221221
ms += date_time.microsecond // 1000
222-
return ms
222+
return long(ms)
223223
else:
224224
raise TypeError("Unable to calculate the number of milliseconds since epoch for type: %s" % type(date_time))
225225

python/sparktk/frame/constructors/import_csv.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from datetime import datetime
2323
from sparktk.frame import schema as sparktk_schema
2424

25-
def import_csv(path, delimiter=",", header=False, infer_schema=True, schema=None, tc=TkContext.implicit):
25+
def import_csv(path, delimiter=",", header=False, infer_schema=True, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit):
2626
"""
2727
Creates a frame with data from a csv file.
2828
@@ -37,11 +37,13 @@ def import_csv(path, delimiter=",", header=False, infer_schema=True, schema=None
3737
and not be included in the data. The default value is false.
3838
:param infer_schema:(Optional[bool]) Boolean value indicating if the column types will be automatically inferred.
3939
It requires one extra pass over the data and is false by default.
40-
:param: schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset. Number of
40+
:param schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset. Number of
4141
columns specified in the schema must match the number of columns in the csv file provided. If the
4242
value from the csv file cannot be converted to the data type specified by the schema (for example,
4343
if the csv file has a string, and the schema specifies an int), the value will show up as missing
4444
(None) in the frame.
45+
:param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
46+
specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
4547
:return: (Frame) Frame that contains the data from the csv file
4648
4749
Examples
@@ -115,7 +117,7 @@ def import_csv(path, delimiter=",", header=False, infer_schema=True, schema=None
115117
"com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
116118
delimiter=delimiter,
117119
header=header_str,
118-
dateformat="yyyy-MM-dd'T'HH:mm:ss.SSSX",
120+
dateformat=datetime_format,
119121
inferschema=infer_schema_str).load(path, schema=pyspark_schema)
120122

121123
df_schema = []

sparktk-core/src/main/scala/org/trustedanalytics/sparktk/frame/internal/constructors/Import.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,17 @@ object Import {
3535
* and not be included in the data. The default value is false.
3636
* @param inferSchema Boolean value indicating if the column types will be automatically inferred. It
3737
* requires one extra pass over the data and is false by default.
38+
* @param dateTimeFormat String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
39+
* specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
3840
* @return Frame with data from the csv file
3941
*/
4042
def importCsv(sc: SparkContext,
4143
path: String,
4244
delimiter: String = ",",
4345
header: Boolean = false,
4446
inferSchema: Boolean = false,
45-
schema: Option[Schema] = None): Frame = {
47+
schema: Option[Schema] = None,
48+
dateTimeFormat: String = "yyyy-MM-dd'T'HH:mm:ss.SSSX"): Frame = {
4649

4750
// If a custom schema is provided there's no reason to infer the schema during the load
4851
val loadWithInferSchema = if (schema.isDefined) false else inferSchema
@@ -57,7 +60,7 @@ object Import {
5760
.option("header", headerStr)
5861
.option("inferSchema", inferSchemaStr)
5962
.option("delimiter", delimiter)
60-
.option("dateFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSX")
63+
.option("dateFormat", dateTimeFormat)
6164

6265
if (!inferSchema && schema.isDefined) {
6366
dfr = dfr.schema(StructType(schema.get.columns.map(column =>

0 commit comments

Comments
 (0)