Commit

Added project files
Kaszanas committed Aug 27, 2021
1 parent d4d3138 commit b37590c
Showing 5 changed files with 160 additions and 1 deletion.
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
MIT License

-Copyright (c) 2021 Kaszanas
+Copyright (c) 2021 Andrzej Białecki

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
55 changes: 55 additions & 0 deletions README.md
@@ -0,0 +1,55 @@
# SC2DatasetPreparator

This repository contains tools that can be used to perform the following steps:

1. Using ```src/directory_flattener.py```, flatten the directory structure and save the old directory tree to a mapping of ```{"replayUniqueHash": "whereItWasInOldStructure"}``` (an example mapping is shown after this list).
2. Using ```src/sc2_replaypack_processor.py```, perform replaypack processing with https://github.com/Kaszanas/SC2InfoExtractorGo
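
The mapping is written to ```processed_mapping.json``` inside the ```<input_path>_processed``` output directory. A minimal illustration of its contents is shown below; the hash and the path are made up purely for illustration:

```
{
  "3f2b8c4d9e6a41d2a7c5e1f0b9d8c7a6": "TournamentPack/Group_A/replay_01.SC2Replay"
}
```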

## Customization

To specify different processing flags for https://github.com/Kaszanas/SC2InfoExtractorGo, please modify ```src/sc2_replaypack_processor.py``` directly, as sketched below.
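
For example, the processing flags live in the argument list passed to ```subprocess.run``` inside ```multiprocessing_client()```. The sketch below mirrors that call and flips a single flag; the ```true``` value for ```-perform_anonymization``` is an assumption about what SC2InfoExtractorGo accepts, and the ```-localized_maps_file``` path is the machine-specific one from the script and will need adjusting:

```
import subprocess

def run_extractor(directory: str, output_directory_filepath: str) -> None:
    # Sketch of the call made in multiprocessing_client(); only
    # -perform_anonymization is flipped here to show where a flag would change.
    subprocess.run(["GoSC2Science.exe",
                    f"-input={directory}/",
                    f"-output={output_directory_filepath}/",
                    "-integrity_check=false",
                    "-validity_check=false",
                    "-number_of_packages=1",
                    "-game_mode=0b1111111111",
                    "-localized_maps_file=F:\\Projects\\EsportDataset\\processed\\program\\new_maps_processed.json",  # adjust to a local path
                    "-perform_anonymization=true",  # changed from "false"
                    "-localize_maps=true",
                    "-with_multiprocessing=false",
                    "-log_level=3",
                    f"-log_dir={output_directory_filepath}/"])
```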

## Usage

Before using this software, please install Python >= 3.7 and the dependencies listed in ```requirements.txt```.

Please keep in mind that ```src/directory_flattener.py``` does not provide default flag values and is configured with the following command-line flags:

```
usage: directory_flattener.py [-h] [--input_path INPUT_PATH]
                              [--file_extension FILE_EXTENSION]

Directory restructuring tool used in order to flatten the structure, map the
old structure to a separate file, and for later processing with other tools.

optional arguments:
  -h, --help            show this help message and exit
  --input_path INPUT_PATH
                        Please provide input path to the dataset that is going
                        to be processed.
  --file_extension FILE_EXTENSION
                        Please provide a file extension for files that will be
                        moved and renamed.
```
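
An example invocation is shown below; the input path and the ```.SC2Replay``` extension are placeholders and should be adjusted to your dataset:

```
python src/directory_flattener.py --input_path ./replaypack_dataset --file_extension .SC2Replay
```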


Please keep in mind that ```src/sc2_replaypack_processor.py``` does not provide default flag values and is configured with the following command-line flags:

```
usage: sc2_replaypack_processor.py [-h] [--input_dir INPUT_DIR]
                                   [--output_dir OUTPUT_DIR]
                                   [--number_of_processes NUMBER_OF_PROCESSES]

Tool used for processing SC2 datasets with
https://github.com/Kaszanas/SC2InfoExtractorGo

optional arguments:
  -h, --help            show this help message and exit
  --input_dir INPUT_DIR
                        Please provide input path to the directory containing
                        the dataset that is going to be processed.
  --output_dir OUTPUT_DIR
                        Please provide an output directory for the resulting
                        files.
  --number_of_processes NUMBER_OF_PROCESSES
                        Please provide the number of processes to be spawned
                        for the dataset processing.
```
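
An example invocation, again with placeholder directories:

```
python src/sc2_replaypack_processor.py --input_dir ./replaypack_dataset_processed --output_dir ./extracted_dataset --number_of_processes 4
```

Note that the script shells out to ```GoSC2Science.exe```, so the SC2InfoExtractorGo binary has to be available in the working directory or on PATH, and the hard-coded ```-localized_maps_file``` path inside the script may need to be adjusted.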

# Citation
Binary file added requirements.txt
Binary file not shown.
45 changes: 45 additions & 0 deletions src/directory_flattener.py
@@ -0,0 +1,45 @@
import os
import argparse
import uuid
import json
import glob
import shutil

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Directory restructuring tool used in order to flatten the structure, map the old structure to a separate file, and for later processing with other tools.")
    parser.add_argument("--input_path", help="Please provide input path to the dataset that is going to be processed.")
    parser.add_argument("--file_extension", help="Please provide a file extension for files that will be moved and renamed.")
    args = parser.parse_args()

    dir_structure_mapping = {}

    # Create the output directory next to the input directory if it doesn't exist:
    new_root_directory = args.input_path + "_processed"
    if not os.path.exists(new_root_directory):
        os.makedirs(new_root_directory)

    # Iterate over the supplied directory:
    for root, _, filenames in os.walk(args.input_path):
        # Performing an action for every file that was detected:
        for file in filenames:
            if file.endswith(args.file_extension):

                # Prepare relative paths:
                relative_dir = os.path.relpath(root, args.input_path)
                relative_file = os.path.join(relative_dir, file)

                # Get a unique filename:
                unique_filename = uuid.uuid4().hex

                # Copying and renaming files:
                current_file = os.path.join(root, file)
                new_path_and_filename = os.path.join(new_root_directory, unique_filename + args.file_extension)
                shutil.copy(current_file, new_path_and_filename)

                # Add to the mapping of unique filename -> original relative path:
                dir_structure_mapping[unique_filename] = relative_file

    # Persist the mapping next to the flattened files:
    with open(os.path.join(new_root_directory, "processed_mapping.json"), "w") as json_file:
        json.dump(dir_structure_mapping, json_file)
59 changes: 59 additions & 0 deletions src/sc2_replaypack_processor.py
@@ -0,0 +1,59 @@
import os
import argparse
import subprocess
from tqdm import tqdm
from multiprocessing import Pool


def multiprocessing_scheduler(processing_arguments, number_of_processes):
    # Distribute (input directory, output directory) pairs among worker processes.
    # With processes=None, Pool falls back to os.cpu_count().
    with Pool(processes=number_of_processes) as pool:
        pool.imap_unordered(multiprocessing_client, processing_arguments)
        pool.close()
        pool.join()


def multiprocessing_client(arguments: tuple):

    directory, output_directory_filepath = arguments

    # Run the external SC2InfoExtractorGo binary on a single replaypack directory:
    subprocess.run(["GoSC2Science.exe",
                    f"-input={directory}/",
                    f"-output={output_directory_filepath}/",
                    "-integrity_check=false",
                    "-validity_check=false",
                    "-number_of_packages=1",
                    "-game_mode=0b1111111111",
                    "-localized_maps_file=F:\\Projects\\EsportDataset\\processed\\program\\new_maps_processed.json",
                    "-perform_anonymization=false",
                    "-localize_maps=true",
                    "-with_multiprocessing=false",
                    "-log_level=3",
                    f"-log_dir={output_directory_filepath}/"])


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Tool used for processing SC2 datasets with https://github.com/Kaszanas/SC2InfoExtractorGo")
    parser.add_argument("--input_dir", help="Please provide input path to the directory containing the dataset that is going to be processed.")
    parser.add_argument("--output_dir", help="Please provide an output directory for the resulting files.")
    parser.add_argument("--number_of_processes", type=int, help="Please provide the number of processes to be spawned for the dataset processing.")
    args = parser.parse_args()

    # Create the main output directory:
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    multiprocessing_list = []
    for directory, _, _ in tqdm(os.walk(args.input_dir)):

        output_directory_name = os.path.basename(directory)
        if output_directory_name == "input":
            continue

        output_directory_filepath = os.path.join(args.output_dir, output_directory_name)

        # Create the output subdirectories:
        if not os.path.exists(output_directory_filepath):
            os.mkdir(output_directory_filepath)

        multiprocessing_list.append((directory, output_directory_filepath))

    multiprocessing_scheduler(multiprocessing_list, args.number_of_processes)
