diff --git a/profiler/__init__.py b/profiler/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/profiler/data.py b/profiler/data.py deleted file mode 100644 index d63f869c13..0000000000 --- a/profiler/data.py +++ /dev/null @@ -1,131 +0,0 @@ -import errno -import glob -import hashlib -import json -import os -from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional - -import attr - - -@attr.define -class ProfileData: - """This class represents the data stored per run""" - - command: str - timestamp: float - stdout: str - stderr: str - tiledb_stats: Dict[str, Any] - somacore_version: str - tiledbsoma_version: str - host_context: Dict[str, str] - user_time_sec: float - system_time_sec: float - pct_of_cpu: float - elapsed_time_sec: float - avg_shared_text_sz_kb: int - avg_unshared_text_sz_kb: int - avg_stack_sz_kb: int - avg_total_sz_kb: int - max_res_set_sz_kb: int - avg_res_set_sz_kb: int - major_page_faults: int - minor_page_faults: int - voluntary_context_switches: int - involuntary_context_switches: int - swaps: int - file_system_inputs: int - file_system_outputs: int - socket_messages_sent: int - socket_messages_received: int - signals_delivered: int - page_size_bytes: int - exit_status: int - custom_out: List[Optional[str]] - - command_key: str = attr.field() - - @command_key.default - def _command_key_factory(self): - return _command_key(self.command) - - -DEFAULT_PROFILE_DB_PATH = "./profiling_db" - - -def _command_key(command: str) -> str: - """Remove space characters from profileDB command keys.""" - return hashlib.md5(command.encode("utf-8")).hexdigest() - - -class ProfileDB(ABC): - """Base class for profiler runs database""" - - @abstractmethod - def __str__(self): - pass - - @abstractmethod - def find(self, command): - pass - - @abstractmethod - def add(self, data: ProfileData): - pass - - @abstractmethod - def close(self): - pass - - -class FileBasedProfileDB(ProfileDB): - """Represents a file-based implementation of a ProfileDB - runs database. Each run is stored as a separate file under a subdirectory structured as `/`. - """ - - def __init__(self, path: str = DEFAULT_PROFILE_DB_PATH): - self.path = path - if not os.path.exists(self.path): - os.mkdir(self.path) - - def __str__(self): - result = "" - if os.path.exists(self.path): - for command_hash in glob.glob(self.path + "/*"): - with open(os.path.join(command_hash, "command.txt"), "r") as f: - command = f.read() - n_runs = len(glob.glob(os.path.join(command_hash, "*.json"))) - result += ( - f"[{command_hash.split('/')[-1]}] \"{command}\": {n_runs} runs\n" - ) - return result - return "" - - def find(self, command) -> List[ProfileData]: - key = _command_key(command) - if not os.path.exists(f"{self.path}/{key}"): - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), key) - result = [] - for filename in glob.glob(f"{self.path}/{key}/*.json"): - with open(filename, "r") as file: - result.append(ProfileData(**json.load(file))) - return result - - def add(self, data: ProfileData) -> str: - key = _command_key(data.command) - os.makedirs(f"{self.path}/{key}", exist_ok=True) - with open(f"{self.path}/{key}/command.txt", "w") as f: - f.write(data.command.strip()) - - key2 = data.timestamp - - filename = f"{self.path}/{key}/{key2}.json" - with open(filename, "w") as f: - json.dump(attr.asdict(data), f) - - return filename - - def close(self): - pass diff --git a/profiler/setup.py b/profiler/setup.py index 7921b399ce..c152f15c29 100644 --- a/profiler/setup.py +++ b/profiler/setup.py @@ -1,8 +1,9 @@ from setuptools import find_packages, setup setup( - name="soma-profiler", + name="profiler", version="1.0", - packages=find_packages(), + packages=find_packages("src"), + package_dir={"": "src"}, requires=["gitpython", "psutil", "tiledbsoma", "cellxgene_census"], ) diff --git a/profiler/src/profiler/__init__.py b/profiler/src/profiler/__init__.py new file mode 100644 index 0000000000..54b62fac32 --- /dev/null +++ b/profiler/src/profiler/__init__.py @@ -0,0 +1,5 @@ +from . import data + +__all__ = [ + "data", +] diff --git a/profiler/__main__.py b/profiler/src/profiler/__main__.py similarity index 100% rename from profiler/__main__.py rename to profiler/src/profiler/__main__.py diff --git a/profiler/context_generator.py b/profiler/src/profiler/context_generator.py similarity index 100% rename from profiler/context_generator.py rename to profiler/src/profiler/context_generator.py diff --git a/profiler/src/profiler/data.py b/profiler/src/profiler/data.py new file mode 100644 index 0000000000..0649ca642a --- /dev/null +++ b/profiler/src/profiler/data.py @@ -0,0 +1,226 @@ +import errno +import glob +import hashlib +import json +import os +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional +from botocore.client import ClientError +import tempfile +import attr +import boto3 +import uuid + +@attr.define +class ProfileData: + """This class represents the data stored per run""" + + command: str + timestamp: float + stdout: str + stderr: str + tiledb_stats: Dict[str, Any] + somacore_version: str + tiledbsoma_version: str + host_context: Dict[str, str] + user_time_sec: float + system_time_sec: float + pct_of_cpu: float + elapsed_time_sec: float + avg_shared_text_sz_kb: int + avg_unshared_text_sz_kb: int + avg_stack_sz_kb: int + avg_total_sz_kb: int + max_res_set_sz_kb: int + avg_res_set_sz_kb: int + major_page_faults: int + minor_page_faults: int + voluntary_context_switches: int + involuntary_context_switches: int + swaps: int + file_system_inputs: int + file_system_outputs: int + socket_messages_sent: int + socket_messages_received: int + signals_delivered: int + page_size_bytes: int + exit_status: int + custom_out: List[Optional[str]] + + command_key: str = attr.field() + + @command_key.default + def _command_key_factory(self): + return _command_key(self.command) + + +DEFAULT_PROFILE_DB_PATH = "./profiling_db" + + +def _command_key(command: str) -> str: + """Remove space characters from profileDB command keys.""" + return hashlib.md5(command.encode("utf-8")).hexdigest() + + +class ProfileDB(ABC): + """Base class for profiler runs database""" + + @abstractmethod + def __str__(self): + pass + + @abstractmethod + def find(self, command): + pass + + @abstractmethod + def add(self, data: ProfileData): + pass + + @abstractmethod + def close(self): + pass + + +class FileBasedProfileDB(ProfileDB): + """Represents a file-based implementation of a ProfileDB + runs database. Each run is stored as a separate file under a subdirectory structured as `/`. + """ + + def __init__(self, path: str = DEFAULT_PROFILE_DB_PATH): + self.path = path + if not os.path.exists(self.path): + os.mkdir(self.path) + + def __str__(self): + result = "" + if os.path.exists(self.path): + for command_hash in glob.glob(self.path + "/*"): + with open(os.path.join(command_hash, "command.txt"), "r") as f: + command = f.read() + n_runs = len(glob.glob(os.path.join(command_hash, "*.json"))) + result += ( + f"[{command_hash.split('/')[-1]}] \"{command}\": {n_runs} runs\n" + ) + return result + return "" + + def find(self, command) -> List[ProfileData]: + key = _command_key(command) + if not os.path.exists(f"{self.path}/{key}"): + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), key) + result = [] + for filename in glob.glob(f"{self.path}/{key}/*.json"): + with open(filename, "r") as file: + result.append(ProfileData(**json.load(file))) + return result + + def add(self, data: ProfileData) -> str: + key = _command_key(data.command) + os.makedirs(f"{self.path}/{key}", exist_ok=True) + filename = f"{self.path}/{key}/command.txt" + + if not os.path.exists(filename): + with open(filename, "w") as f: + f.write(data.command.strip()) + key2 = data.timestamp + + filename = f"{self.path}/{key}/{key2}.json" + with open(filename, "w") as f: + json.dump(attr.asdict(data), f) + + return filename + + def close(self): + pass + +class S3ProfileDB(ProfileDB): + """Represents a S3-based implementation of a ProfileDB database. + Each run is stored as a separate S3 object under a key with the structure `//`. + """ + + def read_object_keys(self, prefix: str, suffix: str) -> List[str]: + # return all the objects kets starting with prefix and ending with suffix + result = self.s3.list_objects(Bucket=self.bucket_name, Prefix=prefix) + keys: List[str] = [] + for o in result.get('Contents'): + object_key = o.get('Key') + if object_key.endswith(suffix): + keys.append(object_key) + return keys + + def read_s3_text(self, key: str) -> str: + # Assume the key is associated with one object. Otherwise, return the first object + result = self.s3.list_objects(Bucket=self.bucket_name, Prefix=key) + for o in result.get('Contents'): + data = self.s3.get_object(Bucket=self.bucket_name, Key=o.get('Key')) + contents = data['Body'].read().decode("utf-8") + return contents + + + def bucket_exist_and_accessible(self): + try: + self.s3.head_bucket(Bucket=self.bucket.name) + except ClientError: + return False + return True + + def __init__(self, bucket_name: str): + # Initialize bucket's info + self.s3 = boto3.client('s3') + self.bucket_name = bucket_name + self.bucket = boto3.resource('s3').Bucket(self.bucket_name) + # Check if the bucket exists + if not self.bucket_exist_and_accessible(): + raise( + Exception(f"Bucket {self.bucket_name} does not exist or access is not granted.")) + + def __str__(self): + result = "" + if self.bucket_exist_and_accessible(): + for command_file_key in self.read_object_keys("", "/command.txt"): + # Extract runs prefix + runs_prefix = command_file_key.replace("/command.txt", "") + for data_file_key in self.read_object_keys(runs_prefix, ".json"): + # Read command. + command = self.read_s3_text(command_file_key) + # Get the number of runs. + n_runs = len(self.read_object_keys(runs_prefix, ".json")) + result =+ f"[{command_file_key}] \"{command}\": {n_runs} runs\n" + return result + return Exception(f"Bucket {self.bucket_name} does not exist or access is not granted.") + + def find(self, command) -> List[ProfileData]: + key = _command_key(command) + result = [] + # Extract all data files associated with this command + for file_key in self.read_object_keys(key, ".json"): + text = self.read_s3_text(file_key) + if len(text) > 0: + result.append(ProfileData(**json.loads(text))) + return result + + def add(self, data: ProfileData) -> str: + key = _command_key(data.command) + s3_command_key = f"{key}/command.txt" + + filename = str(uuid.uuid4()) + + with open(filename, "w") as fp: + fp.write(data.command.strip()) + self.s3.upload_file(os.path.abspath(str(fp.name)), self.bucket_name, s3_command_key) + os.unlink(filename) + + key2 = data.timestamp + data_key = f"{key}/{key2}.json" + filename = str(uuid.uuid4()) + + with open(filename, "w") as fp: + json.dump(attr.asdict(data), fp) + self.s3.upload_file(os.path.abspath(str(fp.name)), self.bucket_name, data_key) + os.unlink(filename) + + return data_key + + def close(self): + pass diff --git a/profiler/profiler.py b/profiler/src/profiler/profiler.py similarity index 95% rename from profiler/profiler.py rename to profiler/src/profiler/profiler.py index 282153219d..3716068af3 100644 --- a/profiler/profiler.py +++ b/profiler/src/profiler/profiler.py @@ -10,10 +10,13 @@ from typing import Any, Dict, Optional import somacore -from context_generator import host_context import tiledbsoma -from data import FileBasedProfileDB, ProfileData, ProfileDB + +from .context_generator import host_context + +# import context_generator +from .data import FileBasedProfileDB, ProfileData, ProfileDB, S3ProfileDB GNU_TIME_FORMAT = ( 'Command being timed: "%C"\n' @@ -96,6 +99,7 @@ def build_profile_data( custom_out=[prof1, prof2], **gnu_time_output_values, ) + return data @@ -117,6 +121,10 @@ def main(): "command", help="The command and its arguments to be profiled (as quoted, single-argument)", ) + parser.add_argument( + "db_path", + help="FileDB Path", + ) parser.add_argument( "-t", "--gtime-cmd", @@ -183,12 +191,13 @@ def main(): print( f"Third profiler {args.prof2} missing output flamegraph file location" ) - p_stdout, p_stderr = p.communicate() if p1 is not None: p1.wait() if p2 is not None: p2.wait() + print("Profiler run done", file=stderr) + print("Profiler run done", file=stderr) p_stdout = p_stdout.decode("utf-8") print(f"The benchmarked process output:\n {p_stdout}", file=stderr) @@ -197,7 +206,7 @@ def main(): p_stderr.decode("utf-8"), p_stdout, args.prof1_output, args.prof2_output ) # Add the run data to DB - db: ProfileDB = FileBasedProfileDB() + db: ProfileDB = S3ProfileDB(args.db_path) db_record_file = db.add(data) db.close() diff --git a/profiler/report.py b/profiler/src/profiler/report.py similarity index 100% rename from profiler/report.py rename to profiler/src/profiler/report.py