Added mapping from raw TopMed variables to harmonized. Also

including the script and setup for generating that mapping.
RTIInternational · Dec 10, 2024 · f944ba3 · f944ba3
1 parent 6492bca
commit f944ba3
Show file tree

Hide file tree

Showing 6 changed files with 2,392 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -131,3 +131,7 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# random
+.DS_Store
+.idea
diff --git a/README.md b/README.md
@@ -15,6 +15,10 @@ Please read the [Collaboration Guide](CONTRIBUTING.md) before contributing.
     * [datamodel](src/bdchm/datamodel) -- generated
       Python datamodel
 * [tests/](tests/) - Python tests
+* [resources/](resources/)
+  * [map-topmed-raw-to-harmonized/](resources/map-topmed-raw-to-harmonized/)
+    * -- script for generating mapping from TopMed raw variables
+      to TopMed harmonized variables
 
 ## Developer Documentation
 

diff --git a/resources/map-topmed-raw-to-harmonized/README.md b/resources/map-topmed-raw-to-harmonized/README.md
@@ -0,0 +1,12 @@
+### Generate mapping from TopMed raw variables to TopMed harmonized variables
+
+#### Files
+
+* [harmonized-variable-documentation](harmonized-variable-documentation)
+  -- symbolic link to local clone of
+  [UW-GAC/topmed-dcc-harmonized-phenotypes](https://github.com/UW-GAC/topmed-dcc-harmonized-phenotypes/tree/master/harmonized-variable-documentation).
+  Clone that repo in the same directory as this repo, and the symbolic
+  link should work.
+* [generate_mapping.py](generate_mapping.py) -- script
+* [raw-to-harmonized-topmed-vars.csv](raw-to-harmonized-topmed-vars.csv)
+  -- output
diff --git a/resources/map-topmed-raw-to-harmonized/generate_mapping.py b/resources/map-topmed-raw-to-harmonized/generate_mapping.py
@@ -0,0 +1,81 @@
+import json
+import csv
+from pathlib import Path
+
+
+def process_json_files(root_dir):
+    """
+    Collect harmonized-to-raw variable mappings from JSON documentation files.
+
+    Every ``*.json`` file under ``root_dir`` (searched recursively) is parsed.
+    For each entry in a file's ``harmonization_units`` list, one row is
+    produced per item in that unit's ``component_study_variables`` list.
+    Files are visited in sorted path order so the result is reproducible
+    across runs and filesystems.
+
+    A file that cannot be read or parsed, or that lacks a required ``name``
+    key, is reported on stdout and skipped; rows gathered from that file
+    before the failure are kept.
+
+    Args:
+        root_dir (str): Path to the root directory containing JSON files
+
+    Returns:
+        list: List of (harmonized_name, harmonization_unit_name, raw_var)
+            tuples
+    """
+    # Convert the root directory path to an absolute Path object
+    root_path = Path(root_dir).resolve()
+
+    # Store the mappings
+    mappings = []
+
+    # glob() yields entries in arbitrary filesystem order; sort so the
+    # generated output is stable from run to run.
+    for json_file in sorted(root_path.glob('**/*.json')):
+        try:
+            # Read as UTF-8 explicitly instead of relying on the locale default
+            with open(json_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+            # Name of the harmonized variable described by this file
+            harmonized_name = data['name']
+
+            # A file without harmonization units contributes no rows
+            for unit in data.get('harmonization_units', []):
+                # Name of this harmonization unit
+                harmonization_unit_name = unit['name']
+
+                # One row per raw variable listed for the unit
+                mappings.extend(
+                    (harmonized_name, harmonization_unit_name, raw_var)
+                    for raw_var in unit.get('component_study_variables', [])
+                )
+
+        except Exception as e:
+            # Deliberate best-effort: one bad file must not abort the run
+            print(f"Error processing {json_file}: {str(e)}")
+
+    return mappings
+
+
+def write_csv(mappings, output_file):
+    """
+    Write the mappings to a CSV file.
+
+    The file is created (or overwritten), begins with the header row
+    ``harmonized,harmonized_unit,raw`` and is encoded as UTF-8 regardless of
+    the platform locale.
+
+    Args:
+        mappings (list): List of
+            (harmonized_name, harmonization_unit_name, raw_var) tuples
+        output_file (str): Path to the output CSV file
+    """
+    # newline='' leaves line-ending handling to the csv module
+    with open(output_file, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        # Write header
+        writer.writerow(['harmonized', 'harmonized_unit', 'raw'])
+        # Write mappings
+        writer.writerows(mappings)
+
+
+def main():
+    """
+    Generate the raw-to-harmonized mapping CSV.
+
+    Reads the JSON files under ``harmonized-variable-documentation`` and
+    writes ``raw-to-harmonized-topmed-vars.csv``. Both paths are resolved
+    relative to the directory containing this script rather than the current
+    working directory, so the script behaves the same wherever it is
+    launched from.
+    """
+    # Anchor paths to the script's own directory: with CWD-relative paths,
+    # running from elsewhere finds no input and silently overwrites the
+    # output with a header-only file.
+    script_dir = Path(__file__).resolve().parent
+    input_dir = script_dir / "harmonized-variable-documentation"
+    output_file = script_dir / "raw-to-harmonized-topmed-vars.csv"
+
+    # Process JSON files and get mappings
+    mappings = process_json_files(input_dir)
+
+    # Write mappings to CSV
+    write_csv(mappings, output_file)
+
+    print(f"Processing complete. Results written to {output_file}")
+
+
+# Run the generator only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/resources/map-topmed-raw-to-harmonized/harmonized-variable-documentation b/resources/map-topmed-raw-to-harmonized/harmonized-variable-documentation
@@ -0,0 +1 @@
+../../../topmed-dcc-harmonized-phenotypes/harmonized-variable-documentation
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		../../../topmed-dcc-harmonized-phenotypes/harmonized-variable-documentation