Bugfix and testing

rhandberg · rhandberg · commit 80e5a039ed23 · 2022-08-31T14:40:09.000+02:00
diff --git a/run_training.py b/run_training.py
@@ -10,6 +10,7 @@
 
 import argparse
 import os
+import sys
 import logging
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 import tensorflow as tf
@@ -60,7 +61,7 @@ def main():
 
 	# Setup logging:
 	formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-	console = logging.StreamHandler()
+	console = logging.StreamHandler(sys.stdout)
 	console.setFormatter(formatter)
 	console.setLevel(logging_level)
 	logger = logging.getLogger('starclass')
diff --git a/starclass/taskmanager.py b/starclass/taskmanager.py
@@ -316,7 +316,7 @@ def get_number_tasks(self, classifier=None):
 		return num
 
 	#----------------------------------------------------------------------------------------------
-	def _query_task(self, classifier=None, priority=None, chunk=1):
+	def _query_task(self, classifier=None, priority=None, chunk=1, ignore_existing=False):
 
 		search_joins = []
 		search_query = []
@@ -330,7 +330,7 @@ def _query_task(self, classifier=None, priority=None, chunk=1):
 			search_query.append(f'temp.starclass_todolist.priority={priority:d}')
 
 		# If a classifier is specified, constrain to only that classifier:
-		if classifier is not None:
+		if classifier is not None and not ignore_existing:
 			search_joins.append(f"LEFT JOIN starclass_diagnostics ON starclass_diagnostics.priority=temp.starclass_todolist.priority AND starclass_diagnostics.classifier='{classifier:s}'")
 			search_query.append("starclass_diagnostics.status IS NULL")
 
@@ -411,7 +411,7 @@ def _query_task(self, classifier=None, priority=None, chunk=1):
 		return None
 
 	#----------------------------------------------------------------------------------------------
-	def get_task(self, priority=None, classifier=None, change_classifier=True, chunk=1):
+	def get_task(self, priority=None, classifier=None, change_classifier=True, chunk=1, ignore_existing=False):
 		"""
 		Get next task to be processed.
 
@@ -432,7 +432,7 @@ def get_task(self, priority=None, classifier=None, change_classifier=True, chunk
 		.. codeauthor:: Rasmus Handberg <rasmush@phys.au.dk>
 		"""
 
-		task = self._query_task(classifier=classifier, priority=priority, chunk=chunk)
+		task = self._query_task(classifier=classifier, priority=priority, chunk=chunk, ignore_existing=ignore_existing)
 
 		# If no task is returned for the given classifier, find another
 		# classifier where tasks are available:
@@ -441,7 +441,7 @@ def get_task(self, priority=None, classifier=None, change_classifier=True, chunk
 			# task for all of them:
 			all_tasks = []
 			for cl in self.all_classifiers.difference([classifier]):
-				task = self._query_task(classifier=cl, priority=priority, chunk=chunk)
+				task = self._query_task(classifier=cl, priority=priority, chunk=chunk, ignore_existing=ignore_existing)
 				if task is not None:
 					all_tasks.append(task)
 
@@ -454,7 +454,7 @@ def get_task(self, priority=None, classifier=None, change_classifier=True, chunk
 
 			# If this is reached, all classifiers are done, and we can
 			# start running the MetaClassifier:
-			task = self._query_task(classifier='meta', priority=priority, chunk=chunk)
+			task = self._query_task(classifier='meta', priority=priority, chunk=chunk, ignore_existing=ignore_existing)
 
 		return task
 
diff --git a/starclass/training_sets/testing_tset.py b/starclass/training_sets/testing_tset.py
@@ -21,7 +21,7 @@ class testing_tset(TrainingSet):
 	.. codeauthor:: Rasmus Handberg <rasmush@phys.au.dk>
 	"""
 	# Class constants:
-	key = 'testtset'
+	key = 'testing'
 	datadir = 'keplerq9v3'
 	_todo_name = 'todo-testing'
 
diff --git a/starclass/training_sets/training_set.py b/starclass/training_sets/training_set.py
@@ -459,7 +459,7 @@ def features(self):
 		with BaseClassifier(tset=self, features_cache=self.features_cache) as stcl:
 			for rowidx in self.train_idx:
 				task = self.tm.get_task(priority=rowidx+1, classifier=cl,
-					change_classifier=False, chunk=1)
+					change_classifier=False, chunk=1, ignore_existing=True)
 
 				# Lightcurve file to load:
 				# We do not use the one from the database because in the simulations the
@@ -487,7 +487,7 @@ def features_test(self):
 		# when opened several times in parallel.
 		for rowidx in self.test_idx:
 			task = self.tm.get_task(priority=rowidx+1, classifier=cl,
-				change_classifier=False, chunk=1)
+				change_classifier=False, chunk=1, ignore_existing=True)
 
 			# Lightcurve file to load:
 			# We do not use the one from the database because in the simulations the
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -41,6 +41,7 @@ def capture_run_cli(cli, params=[], mpiexec=False):
 	if mpiexec:
 		cmd = ['mpiexec', '-n', '2'] + cmd
 
+	print("Running command: " + ' '.join(cmd))
 	proc = subprocess.Popen(cmd,
 		cwd=os.path.join(os.path.dirname(__file__), '..'),
 		stdout=subprocess.PIPE,
diff --git a/tests/input/meta/keplerq9v3-tset.sqlite b/tests/input/meta/keplerq9v3-tset.sqlite
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c483fbd9dae17610a6c08931b3590d43f45d44eb27ad883bcabbb80c39b92ef0
+size 23379968
diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py
@@ -26,42 +26,27 @@
 AVAILABLE_CLASSIFIERS.remove('meta')
 
 #--------------------------------------------------------------------------------------------------
-@pytest.mark.parametrize('classifier', AVAILABLE_CLASSIFIERS) # FIXME:  + ['meta']
+@pytest.mark.parametrize('classifier', AVAILABLE_CLASSIFIERS + ['meta'])
 def test_classifiers_train_test(monkeypatch, SHARED_INPUT_DIR, classifier):
 
 	stcl = starclass.get_classifier(classifier)
 
-	# Pick out a task to use for testing:
-	with starclass.TaskManager(SHARED_INPUT_DIR) as tm:
-		task1 = tm.get_task(classifier=classifier, change_classifier=False, chunk=1)[0]
-		print(task1)
-
 	with tempfile.TemporaryDirectory(prefix='starclass-testing-') as tmpdir:
 		if classifier == 'meta':
 			# For the MetaClassifier, we need to manipulate the training-set
 			# a little bit before we can train. We have to mimic that
 			# all the other classifiers have already been trained and cross-validated
 			# in order to fill up the training-set todo-file with probabilities
 			# which the MetaClassifier uses for training.
-			tsetclass = starclass.get_trainingset('keplerq9v3')
-			input_folder = tsetclass.find_input_folder()
-
-			# Create a copy of the root files of the trainings set (ignore that actual data)
-			# in the temp. directory:
-			tsetdir = os.path.join(tmpdir, os.path.basename(input_folder))
-			print("New dummy input folder: %s" % tsetdir)
-			os.makedirs(tsetdir)
-			for f in os.listdir(input_folder):
-				fpath = os.path.join(input_folder, f)
-				if os.path.isfile(fpath) and not f.endswith(('.sqlite', '.sqlite-journal')):
-					shutil.copy(fpath, tsetdir)
 
 			# Change the environment variable to the temp. dir:
 			monkeypatch.setenv("STARCLASS_TSETS", tmpdir)
 
 			# Copy the pre-prepared todo-file to the training-set directory:
-			prepared_todo = os.path.join(SHARED_INPUT_DIR, 'meta', 'todo.sqlite')
+			tsetclass = starclass.get_trainingset('keplerq9v3')
+			prepared_todo = os.path.join(SHARED_INPUT_DIR, 'meta', 'keplerq9v3-tset.sqlite')
 			new_todo = os.path.join(tsetclass.find_input_folder(), tsetclass._todo_name + '.sqlite')
+			os.makedirs(os.path.dirname(new_todo), exist_ok=True)
 			shutil.copyfile(prepared_todo, new_todo)
 
 			# Initialize the training-set in the temp folder,
@@ -72,6 +57,14 @@ def test_classifiers_train_test(monkeypatch, SHARED_INPUT_DIR, classifier):
 			tsetclass = starclass.get_trainingset('testing')
 			tset = tsetclass(tf=0.2, random_seed=42)
 
+		print(tset)
+		print(tset.fake_metaclassifier)
+
+		# Pick out a task to use for testing:
+		with starclass.TaskManager(tset.todo_file, load_into_memory=False, classes=tset.StellarClasses) as tm:
+			task1 = tm.get_task(classifier=classifier, change_classifier=False, chunk=1)[0]
+			print(task1)
+
 		# Initialize the classifier and run training and testing:
 		with stcl(tset=tset, features_cache=None, data_dir=tmpdir) as cl:
 			print(cl.data_dir)
@@ -132,27 +125,47 @@ def test_classifiers_train_test(monkeypatch, SHARED_INPUT_DIR, classifier):
 		assert results1[key] == results2[key], "Non-identical results before and after saving/loading model"
 
 #--------------------------------------------------------------------------------------------------
-@pytest.mark.parametrize('classifier', AVAILABLE_CLASSIFIERS)
-def test_run_training(PRIVATE_INPUT_DIR, classifier):
+@pytest.mark.parametrize('classifier', AVAILABLE_CLASSIFIERS) # FIXME: + ['meta']
+def test_run_training_and_starclass(monkeypatch, PRIVATE_INPUT_DIR, classifier):
+	with tempfile.TemporaryDirectory(prefix='starclass-testing-') as tmpdir:
+		if classifier == 'meta':
+			# For the MetaClassifier, we need to manipulate the training-set
+			# a little bit before we can train. We have to mimic that
+			# all the other classifiers have already been trained and cross-validated
+			# in order to fill up the training-set todo-file with probabilities
+			# which the MetaClassifier uses for training.
+
+			# Change the environment variable to the temp. dir:
+			monkeypatch.setenv("STARCLASS_TSETS", tmpdir)
 
-	tsetclass = starclass.get_trainingset('testing')
-	tset = tsetclass(tf=0.2, random_seed=42)
+			# Copy the pre-prepared todo-file to the training-set directory:
+			tsetclass = starclass.get_trainingset('keplerq9v3')
+			prepared_todo = os.path.join(PRIVATE_INPUT_DIR, 'meta', 'keplerq9v3-tset.sqlite')
+			new_todo = os.path.join(tsetclass.find_input_folder(), tsetclass._todo_name + '.sqlite')
+			os.makedirs(os.path.dirname(new_todo), exist_ok=True)
+			shutil.copyfile(prepared_todo, new_todo)
+
+			tset = tsetclass(tf=0.2, random_seed=42)
+			tset.fake_metaclassifier = True
+		else:
+			tsetclass = starclass.get_trainingset('testing')
+			tset = tsetclass(tf=0.2, random_seed=42)
 
-	with tempfile.TemporaryDirectory(prefix='starclass-testing-') as tmpdir:
 		logfile = os.path.join(tmpdir, 'training.log')
 		todo_file = os.path.join(PRIVATE_INPUT_DIR, 'todo_run.sqlite')
 
 		# Train the classifier:
 		out, err, exitcode = capture_run_cli('run_training.py', [
 			'--classifier=' + classifier,
-			'--trainingset=testing',
+			'--trainingset=' + tset.key,
 			'--level=L1',
 			'--testfraction=0.2',
 			'--log=' + logfile,
 			'--log-level=info',
 			'--output=' + tmpdir
 		])
 		assert exitcode == 0
+		assert ' - INFO - Done.' in out
 
 		# Check that a log-file was indeed generated:
 		assert os.path.isfile(logfile), "Log-file not generated"
@@ -174,7 +187,7 @@ def test_run_training(PRIVATE_INPUT_DIR, classifier):
 				'--debug',
 				'--overwrite',
 				'--classifier=' + classifier,
-				'--trainingset=testing',
+				'--trainingset=' + tset.key,
 				'--level=L1',
 				'--datadir=' + tmpdir,
 				todo_file
@@ -188,10 +201,10 @@ def test_run_training(PRIVATE_INPUT_DIR, classifier):
 
 				cursor.execute("SELECT * FROM starclass_settings;")
 				row = cursor.fetchall()
-				assert len(row) == 1, "Only one settings row should exist"
+				assert len(row) == 1, "Exactly one settings row should exist"
 				settings = row[0]
 				print(dict(settings))
-				assert settings['tset'] == 'testtset'
+				assert settings['tset'] == tset.key
 
 				cursor.execute("SELECT * FROM starclass_diagnostics WHERE priority=17;")
 				row = cursor.fetchone()
@@ -208,12 +221,12 @@ def test_run_training(PRIVATE_INPUT_DIR, classifier):
 					print(dict(row))
 					assert row['priority'] == 17
 					assert row['classifier'] == classifier
-					tset.StellarClasses[row['class']] # Will result in KeyError of not correct
+					tset.StellarClasses[row['class']] # Will result in KeyError if not correct
 					assert 0 <= row['prob'] <= 1, "Invalid probability"
 
 				cursor.execute("SELECT * FROM starclass_features_common;")
 				results = cursor.fetchall()
-				assert len(results) == 1
+				assert len(results) == 1, "Exactly one features_common row should exist"
 				row = dict(results[0])
 				print(row)
 				assert row['priority'] == 17
@@ -222,7 +235,7 @@ def test_run_training(PRIVATE_INPUT_DIR, classifier):
 				if classifier != 'slosh':
 					cursor.execute(f"SELECT * FROM starclass_features_{classifier:s};")
 					results = cursor.fetchall()
-					assert len(results) == 1
+					assert len(results) == 1, f"Exactly one features_{classifier:s} row should exist"
 					row = dict(results[0])
 					print(row)
 					assert row['priority'] == 17

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:c483fbd9dae17610a6c08931b3590d43f45d44eb27ad883bcabbb80c39b92ef0`
	`3`	`+size 23379968`