-
Notifications
You must be signed in to change notification settings - Fork 19
/
apple_health_xml_convert.py
154 lines (119 loc) · 4.74 KB
/
apple_health_xml_convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Simple Apple Health XML to CSV
==============================
:File: apple_health_xml_convert.py
:Description: Convert Apple Health "export.xml" file into a csv
:Version: 0.0.2
:Created: 2019-10-04
:Updated: 2023-10-29
:Authors: Jason Meno (jam)
:Dependencies: An export.xml file from Apple Health
:License: BSD-2-Clause
"""
# %% Imports
import os
import pandas as pd
import xml.etree.ElementTree as ET
import datetime as dt
import sys
# %% Function Definitions
def preprocess_to_temp_file(file_path):
    r"""
    Write a cleaned copy of an Apple Health export to a temporary file.

    The export.xml file is where all your data is, but Apple Health Export has
    two main problems that make it difficult to parse:
        1. The DTD markup syntax is exported incorrectly by Apple Health for
           some data types.
        2. The invisible character \x0b (vertical tab, sometimes rendered as
           U+000b) breaks the element tree and raises ParseErrors.

    To avoid both, every line from the one containing '<!DOCTYPE' through the
    one containing ']>' is dropped, and \x0b is stripped from all other lines.

    Args:
        file_path: Path of the original export.xml file.

    Returns:
        The path of the pre-processed temporary file
        ("temp_preprocessed_export.xml" in the current working directory).
    """

    print("Pre-processing and writing to temporary file...", end="")
    sys.stdout.flush()

    temp_file_path = "temp_preprocessed_export.xml"

    # Apple Health exports are UTF-8; state it explicitly so the result does
    # not depend on the platform's default text encoding (e.g. on Windows).
    with open(file_path, 'r', encoding='utf-8') as infile, \
            open(temp_file_path, 'w', encoding='utf-8') as outfile:
        skip_dtd = False
        for line in infile:
            if '<!DOCTYPE' in line:
                skip_dtd = True
            if not skip_dtd:
                line = strip_invisible_character(line)
                outfile.write(line)
            # Checked after the write so the closing DTD line is dropped too
            if ']>' in line:
                skip_dtd = False

    print("done!")
    return temp_file_path
def strip_invisible_character(line):
    """Return ``line`` with every vertical-tab character (``\\x0b``) removed."""
    pieces = line.split("\x0b")
    return "".join(pieces)
def xml_to_csv(file_path):
    """Parse a pre-processed Apple Health XML file into a dataframe.

    Every element in the document becomes one row built from its XML
    attributes. The key/value pair of each direct ``MetadataEntry`` child is
    additionally merged into the parent's row as an extra column.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        A pandas DataFrame with the most commonly used columns first and the
        rows sorted by 'startDate', newest first.
    """

    print("Converting XML File to CSV...", end="")
    sys.stdout.flush()

    attribute_list = []
    depth = 0

    for event, elem in ET.iterparse(file_path, events=('start', 'end')):
        if event == 'start':
            depth += 1
            continue
        depth -= 1

        # Copy the attributes: elem.clear() below must not be able to empty
        # the row that has already been collected.
        row = dict(elem.attrib)
        for child in elem:
            if child.tag != 'MetadataEntry':
                continue
            key = child.get('key')
            value = child.get('value')
            if key is not None and value is not None:
                row[key] = value
        attribute_list.append(row)

        # Only clear the root and its direct children. Clearing nested
        # elements on their own 'end' event would wipe their attributes
        # before the parent gets to read them; clearing the parent releases
        # the whole subtree anyway.
        if depth <= 1:
            elem.clear()

    health_df = pd.DataFrame(attribute_list)

    # Every health data type and some columns have a long identifier
    # Removing these for readability
    if 'type' in health_df.columns:
        for prefix in ('HKQuantityTypeIdentifier', 'HKCategoryTypeIdentifier'):
            health_df['type'] = \
                health_df['type'].str.replace(prefix, "", regex=False)
    if len(health_df.columns) > 0:
        health_df.columns = health_df.columns.str.replace(
            "HKCharacteristicTypeIdentifier", "", regex=False)

    # Reorder some of the columns for easier visual data review
    original_cols = list(health_df)
    shifted_cols = ['type',
                    'sourceName',
                    'value',
                    'unit',
                    'startDate',
                    'endDate',
                    'creationDate']

    # Add loop specific column ordering if metadata entries exist
    loop_cols = ['com.loopkit.InsulinKit.MetadataKeyProgrammedTempBasalRate',
                 'com.loopkit.InsulinKit.MetadataKeyScheduledBasalRate',
                 'com.loudnate.CarbKit.HKMetadataKey.AbsorptionTimeMinutes']
    shifted_cols.extend(col for col in loop_cols if col in original_cols)

    # Keep the remaining columns in their original order so the output
    # layout is the same on every run
    remaining_cols = [col for col in original_cols if col not in shifted_cols]
    reordered_cols = shifted_cols + remaining_cols
    health_df = health_df.reindex(labels=reordered_cols, axis='columns')

    # Sort by newest data first
    health_df = health_df.sort_values(by='startDate', ascending=False)

    print("done!")

    return health_df
def save_to_csv(health_df):
    """Write ``health_df`` to a dated CSV file in the current directory.

    The file is named ``apple_health_export_<YYYY-MM-DD>.csv`` using today's
    local date, and the dataframe index is not written.
    """
    print("Saving CSV file...", end="")
    sys.stdout.flush()

    date_stamp = dt.datetime.now().strftime('%Y-%m-%d')
    output_name = f"apple_health_export_{date_stamp}.csv"
    health_df.to_csv(output_name, index=False)

    print("done!")
def remove_temp_file(temp_file_path):
    """Delete the file at ``temp_file_path``, reporting progress on stdout."""
    print("Removing temporary file...", end="")
    os.unlink(temp_file_path)
    print("done!")
def main(file_path="export.xml"):
    """Convert an Apple Health export into a dated CSV file.

    Pre-processes the export into a temporary file, converts that file into a
    dataframe, saves the dataframe as CSV and finally removes the temporary
    file.

    Args:
        file_path: Path of the Apple Health XML export. Defaults to
            "export.xml" in the current working directory.
    """
    temp_file_path = preprocess_to_temp_file(file_path)
    try:
        health_df = xml_to_csv(temp_file_path)
        save_to_csv(health_df)
    finally:
        # Always delete the temporary copy, even when converting or saving
        # fails, so a failed run does not leave it behind.
        remove_temp_file(temp_file_path)
# %%
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()