From 777f142acf8510212f534693772821ceb37a49fa Mon Sep 17 00:00:00 2001
From: Jason Meno <jasonmeno@msn.com>
Date: Sat, 12 Nov 2022 15:53:35 -0800
Subject: [PATCH] refactor and fix ParseError

---
 apple_health_xml_convert.py | 61 ++++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 18 deletions(-)
diff --git a/apple_health_xml_convert.py b/apple_health_xml_convert.py
index 8bfce2b..a229b8c 100644
--- a/apple_health_xml_convert.py
+++ b/apple_health_xml_convert.py
@@ -16,34 +16,53 @@
 import pandas as pd
 import xml.etree.ElementTree as ET
 import datetime as dt
+import re
+import sys
 
 
 # %% Function Definitions
-def pre_process():
-    """Pre-processes the XML file by replacing specific bits that would
-    normally result in a ParseError
+
+def pre_process(xml_string):
     """
+    The export.xml file is where all your data is, but Apple Health Export has
+    two main problems that make it difficult to parse: 
+        1. The DTD markup syntax is exported incorrectly by Apple Health for some data types.
+        2. The invisible vertical-tab character \\x0b (sometimes rendered as U+000B) likes to destroy trees. Think of the trees!
 
-    print("Pre-processing...", end="")
-    with open("export.xml") as f:
-        newText = f.read().replace("\x0b", "")
+    Knowing this, we can save the trees and pre-process the XML data to avoid destruction and ParseErrors.
+    """
 
-    # with open("apple_health_export_2/new_export.xml", "w") as f:
-    with open("processed_export.xml", "w") as f:
-        f.write(newText)
+    print("Pre-processing...", end="")
+    sys.stdout.flush()
 
+    xml_string = strip_dtd(xml_string)
+    xml_string = strip_invisible_character(xml_string)
     print("done!")
 
-    return
+    return xml_string
+
+
+def strip_invisible_character(xml_string):
 
+    return xml_string.replace("\x0b", "")
 
-def convert_xml():
+
+def strip_dtd(xml_string):
+    start_strip = re.search('<!DOCTYPE', xml_string).span()[0]
+    end_strip = re.search(']>', xml_string).span()[1]
+
+    return xml_string[:start_strip] + xml_string[end_strip:]
+
+
+def xml_to_csv(xml_string):
     """Loops through the element tree, retrieving all objects, and then
     combining them together into a dataframe
     """
 
-    print("Converting XML File...", end="")
-    etree = ET.parse("processed_export.xml")
+    print("Converting XML File to CSV...", end="")
+    sys.stdout.flush()
+
+    etree = ET.ElementTree(ET.fromstring(xml_string))
 
     attribute_list = []
 
@@ -78,13 +97,16 @@ def convert_xml():
 
     # Add loop specific column ordering if metadata entries exist
     if 'com.loopkit.InsulinKit.MetadataKeyProgrammedTempBasalRate' in original_cols:
-        shifted_cols.append('com.loopkit.InsulinKit.MetadataKeyProgrammedTempBasalRate')
+        shifted_cols.append(
+            'com.loopkit.InsulinKit.MetadataKeyProgrammedTempBasalRate')
 
     if 'com.loopkit.InsulinKit.MetadataKeyScheduledBasalRate' in original_cols:
-        shifted_cols.append('com.loopkit.InsulinKit.MetadataKeyScheduledBasalRate')
+        shifted_cols.append(
+            'com.loopkit.InsulinKit.MetadataKeyScheduledBasalRate')
 
     if 'com.loudnate.CarbKit.HKMetadataKey.AbsorptionTimeMinutes' in original_cols:
-        shifted_cols.append('com.loudnate.CarbKit.HKMetadataKey.AbsorptionTimeMinutes')
+        shifted_cols.append(
+            'com.loudnate.CarbKit.HKMetadataKey.AbsorptionTimeMinutes')
 
     remaining_cols = list(set(original_cols) - set(shifted_cols))
     reordered_cols = shifted_cols + remaining_cols
@@ -100,6 +122,8 @@ def convert_xml():
 
 def save_to_csv(health_df):
     print("Saving CSV file...", end="")
+    sys.stdout.flush()
+
     today = dt.datetime.now().strftime('%Y-%m-%d')
     health_df.to_csv("apple_health_export_" + today + ".csv", index=False)
     print("done!")
@@ -108,8 +132,9 @@ def save_to_csv(health_df):
 
 
 def main():
-    pre_process()
-    health_df = convert_xml()
+    xml_string = open("export.xml").read()
+    xml_string = pre_process(xml_string)
+    health_df = xml_to_csv(xml_string)
     save_to_csv(health_df)
 
     return