-
Notifications
You must be signed in to change notification settings - Fork 106
/
load_avro.py
60 lines (51 loc) · 1.95 KB
/
load_avro.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Load an Avro-formatted file into an SFrame
import graphlab as gl
import os
def load_sframe_from_avro(url):
    """
    Download an Avro file and load it into an SFrame.

    The file is first read into an SArray and then unpacked, so that each
    root-level field in the Avro schema corresponds to a column in the
    resulting SFrame.

    @input url URL of the Avro file to download and read.
    @returns Output SFrame
    """
    # Column-name prefix requested from unpack(); unpacked columns come back
    # named "<prefix>.<field>" (e.g. "X.business_id").
    unpack_prefix = 'X'
    strip = unpack_prefix + '.'

    # download the Avro file; extract=False keeps the file as downloaded
    gl.util.download_dataset(url, extract=False)

    # load the downloaded Avro file into an SArray; it is looked up by the
    # base name of the URL (presumably in the current working directory --
    # confirm against download_dataset's output location)
    my_avro_file = os.path.basename(url)
    sf = gl.SArray.from_avro(my_avro_file).unpack(
        column_name_prefix=unpack_prefix)

    # rename columns after unpack: drop the "<prefix>." part, but only from
    # names that actually carry it, so no other name is ever truncated
    renames = {}
    for cn in sf.column_names():
        if cn.startswith(strip):
            renames[cn] = cn[len(strip):]
    if renames:
        sf.rename(renames)

    # Example result for the sample reviews dataset:
    #
    # >>> sf.head(1)
    # Columns:
    # business_id str
    # date str
    # review_id str
    # stars int
    # text str
    # type str
    # user_id str
    # votes dict
    # Rows: 1
    # Data:
    # +------------------------+------------+------------------------+-------+
    # | business_id | date | review_id | stars |
    # +------------------------+------------+------------------------+-------+
    # | WIcDFpHEnC3ihNmS7-6-ZA | 2011-02-11 | 0ESSqLfOae77muWTv_zUqA | 3 |
    # +------------------------+------------+------------------------+-------+
    # +--------------------------------+--------+------------------------+
    # | text | type | user_id |
    # +--------------------------------+--------+------------------------+
    # | Lately i have been feeling ... | review | r-t7IiTSD0QZdt8lOUCqeQ |
    # +--------------------------------+--------+------------------------+
    # +--------------------------------+
    # | votes |
    # +--------------------------------+
    # | {'funny': 1, 'useful': 1, ... |
    # +--------------------------------+
    # [1 rows x 8 columns]
    return sf
# Example usage: download the sample reviews file and load it as an SFrame.
avro_sf = load_sframe_from_avro(
    url='https://static.turi.com/datasets/how-to/reviews.avro'
)