diff --git a/.gitignore b/.gitignore index 65531cc..cb251d0 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ *.swp *.swo third_party/* +core/domain/proto/*.py +!core/domain/proto/__init__.py diff --git a/core/classifiers/CodeClassifier/CodeClassifier.py b/core/classifiers/CodeClassifier/CodeClassifier.py index ff80ecf..4f2ddf7 100644 --- a/core/classifiers/CodeClassifier/CodeClassifier.py +++ b/core/classifiers/CodeClassifier/CodeClassifier.py @@ -538,6 +538,22 @@ def __init__(self): # text into a feature a vector. self.count_vector = None + @property + def name_in_job_result_proto(self): + # This property needs to be defined as it is defined as + # abstract property in BaseClassifier. However, since code classifier + # is currently disabled, the proto message definitions for code + # classifier will not be added until it is re-implemented and enabled. + return 'code_classifier' + + @property + def type_in_job_result_proto(self): + # This property needs to be defined as it is defined as + # abstract property in BaseClassifier. However, since code classifier + # is currently disabled, the proto message definitions for code + # classifier will not be added until it is re-implemented and enabled. + return '%sFrozenModel' % self.__class__.__name__ + def to_dict(self): """Returns a dict representing this classifier. diff --git a/core/classifiers/TextClassifier/TextClassifier.py b/core/classifiers/TextClassifier/TextClassifier.py index 7db7f6f..968e89c 100644 --- a/core/classifiers/TextClassifier/TextClassifier.py +++ b/core/classifiers/TextClassifier/TextClassifier.py @@ -35,6 +35,7 @@ class TextClassifier(base.BaseClassifier): Support Vector Classifier (SVC) to obtain the best model using the linear kernel. """ + def __init__(self): super(TextClassifier, self).__init__() # sklearn.svm.SVC classifier object. 
@@ -54,6 +55,14 @@ def __init__(self): # Time taken to train the classifier self.exec_time = None + @property + def name_in_job_result_proto(self): + return 'text_classifier' + + @property + def type_in_job_result_proto(self): + return '%sFrozenModel' % (self.__class__.__name__) + def train(self, training_data): """Trains classifier using given training_data. diff --git a/core/classifiers/base.py b/core/classifiers/base.py index f96109d..e03d1c4 100644 --- a/core/classifiers/base.py +++ b/core/classifiers/base.py @@ -36,6 +36,22 @@ class BaseClassifier(object): def __init__(self): pass + @property + @abc.abstractproperty + def name_in_job_result_proto(self): + """A property that identifies the attribute in job result proto message + which will store this classifier's classifier data. + """ + raise NotImplementedError + + @property + @abc.abstractproperty + def type_in_job_result_proto(self): + """The type of the property in job result proto message which stores + this classifier's classifier data. + """ + raise NotImplementedError + @abc.abstractmethod def to_dict(self): """Returns a dict representing this classifier. 
diff --git a/core/services/__init__.py b/core/domain/__init__.py similarity index 100% rename from core/services/__init__.py rename to core/domain/__init__.py diff --git a/core/services/job_services.py b/core/domain/job_services.py similarity index 99% rename from core/services/job_services.py rename to core/domain/job_services.py index 2368379..cf22815 100644 --- a/core/services/job_services.py +++ b/core/domain/job_services.py @@ -18,7 +18,7 @@ from core.classifiers import algorithm_registry from core.classifiers import classifier_utils -from core.services import remote_access_services +from core.domain import remote_access_services # pylint: disable=too-many-branches def _validate_job_data(job_data): diff --git a/core/domain/proto/__init__.py b/core/domain/proto/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/core/domain/proto/text_classifier.proto b/core/domain/proto/text_classifier.proto new file mode 100644 index 0000000..d75e390 --- /dev/null +++ b/core/domain/proto/text_classifier.proto @@ -0,0 +1,23 @@ +// coding: utf-8 +// +// Copyright 2020 The Oppia Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS-IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +message TextClassifierFrozenModel { + // The parameters of a trained text classifier model which are necessary + // for inference. 
+ string model_json = 1; +} diff --git a/core/domain/proto/training_job_response_payload.proto b/core/domain/proto/training_job_response_payload.proto new file mode 100644 index 0000000..e0234f4 --- /dev/null +++ b/core/domain/proto/training_job_response_payload.proto @@ -0,0 +1,44 @@ +// coding: utf-8 +// +// Copyright 2020 The Oppia Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS-IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +import "core/domain/proto/text_classifier.proto"; + +// Training job response payload contains job result of the training job +// along with other metadata items such as vm_id (to identify which VM executed +// this job) and signature of the payload for security purpose. +message TrainingJobResponsePayload { + // Job result of the training job. Job result contains the ID of the Job and + // trained model (frozen model) of the job. + message JobResult { + // Id of the training job whose data is being stored. + string job_id = 1; + + // Each of the classifier algorithms' proto message must be present in + // the oneof classifier_data field. + oneof classifier_frozen_model { + TextClassifierFrozenModel text_classifier = 2; + } + } + JobResult job_result = 1; + + // Id of the VM instance that trained the job. + string vm_id = 2; + + // Signature of the job data for authenticated communication. 
+ string signature = 3; +} diff --git a/core/services/remote_access_services.py b/core/domain/remote_access_services.py similarity index 100% rename from core/services/remote_access_services.py rename to core/domain/remote_access_services.py diff --git a/core/services/remote_access_services_test.py b/core/domain/remote_access_services_test.py similarity index 98% rename from core/services/remote_access_services_test.py rename to core/domain/remote_access_services_test.py index 9d068b4..a1154dd 100644 --- a/core/services/remote_access_services_test.py +++ b/core/domain/remote_access_services_test.py @@ -16,7 +16,7 @@ """Tests for remote access services.""" -from core.services import remote_access_services +from core.domain import remote_access_services from core.tests import test_utils import vmconf diff --git a/core/domain/training_job_result_domain.py b/core/domain/training_job_result_domain.py new file mode 100644 index 0000000..18707c0 --- /dev/null +++ b/core/domain/training_job_result_domain.py @@ -0,0 +1,81 @@ +# coding: utf-8 +# +# Copyright 2020 The Oppia Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functions and classdefs related to protobuf files used in Oppia-ml""" + +from core.classifiers import algorithm_registry +from core.domain.proto import training_job_response_payload_pb2 + +class TrainingJobResult(object): + """TrainingJobResult domain object. 
+ + This domain object stores the training job result along with job_id and + algorithm_id. The training job result is the trained classifier data. + """ + + def __init__(self, job_id, algorithm_id, classifier_data): + """Initializes TrainingJobResult object. + + Args: + job_id: str. The id of the training job whose results are stored + in classifier_data. + algorithm_id: str. The id of the algorithm of the training job. + classifier_data: object. Frozen model of the corresponding + training job. + """ + self.job_id = job_id + self.algorithm_id = algorithm_id + self.classifier_data = classifier_data + + def validate(self): + """Validate that TrainingJobResult object stores correct data. + + Raises: + Exception: str. The classifier data is stored in a field + that does not correspond to algorithm_id. + """ + + # Ensure that the classifier_data corresponds to the classifier + # having given algorithm_id. + classifier = algorithm_registry.Registry.get_classifier_by_algorithm_id( + self.algorithm_id) + if ( + type(self.classifier_data).__name__ != + classifier.type_in_job_result_proto): + raise Exception( + "Expected classifier data of type %s but found %s type" % ( + classifier.type_in_job_result_proto, + type(self.classifier_data).__name__)) + + def to_proto(self): + """Generate TrainingJobResult protobuf object from the TrainingJobResult + domain object. + + Returns: + TrainingJobResult protobuf object. Protobuf object corresponding to + TrainingJobResult protobuf message definition. + """ + self.validate() + proto_message = ( + training_job_response_payload_pb2.
+ TrainingJobResponsePayload.JobResult()) + proto_message.job_id = self.job_id + job_result_attribute = ( + algorithm_registry.Registry.get_classifier_by_algorithm_id( + self.algorithm_id).name_in_job_result_proto) + getattr(proto_message, job_result_attribute).CopyFrom( + self.classifier_data) + return proto_message diff --git a/core/domain/training_job_result_domain_test.py b/core/domain/training_job_result_domain_test.py new file mode 100644 index 0000000..986b45a --- /dev/null +++ b/core/domain/training_job_result_domain_test.py @@ -0,0 +1,90 @@ +# coding: utf-8 +# +# Copyright 2020 The Oppia Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for training job result domain""" + +from core.classifiers import algorithm_registry +from core.domain import training_job_result_domain +from core.domain.proto import training_job_response_payload_pb2 +from core.domain.proto import text_classifier_pb2 +from core.tests import test_utils + +class TrainingJobResultTests(test_utils.GenericTestBase): + """Tests for TrainingJobResult domain object.""" + + def test_validate_job_data_with_valid_model_does_not_raise_exception(self): # pylint: disable=no-self-use + """Ensure that validation checks do not raise exceptions when + a valid classifier model is supplied. 
+ """ + job_id = 'job_id' + algorithm_id = 'TextClassifier' + classifier_data = text_classifier_pb2.TextClassifierFrozenModel() + classifier_data.model_json = 'dummy model' + job_result = training_job_result_domain.TrainingJobResult( + job_id, algorithm_id, classifier_data) + job_result.validate() + + def test_validate_job_data_with_invalid_model_raises_exception(self): + """Ensure that validation checks raise exception when + an invalid classifier model is supplied. + """ + job_id = 'job_id' + algorithm_id = 'TextClassifier' + classifier_data = 'simple classifier' + job_result = training_job_result_domain.TrainingJobResult( + job_id, algorithm_id, classifier_data) + with self.assertRaisesRegexp( + Exception, + 'Expected classifier data of type TextClassifier'): + job_result.validate() + + def test_that_all_algorithms_have_job_result_information(self): + """Test that all algorithms have properties to identify name and type + of attribute in job result proto which stores classifier data for that + algorithm. + """ + job_result_proto = ( + training_job_response_payload_pb2. + TrainingJobResponsePayload.JobResult()) + for classifier in algorithm_registry.Registry.get_all_classifiers(): + self.assertIsNotNone(classifier.name_in_job_result_proto) + attribute_type_name = type(getattr( + job_result_proto, classifier.name_in_job_result_proto)).__name__ + self.assertEqual( + attribute_type_name, classifier.type_in_job_result_proto) + + def test_that_training_job_result_proto_is_generated_with_correct_details( + self): + """Ensure that the JobResult proto is correctly generated from + TrainingJobResult domain object. 
+ """ + classifier_data = text_classifier_pb2.TextClassifierFrozenModel() + classifier_data.model_json = 'dummy model' + job_id = 'job_id' + algorithm_id = 'TextClassifier' + classifier = algorithm_registry.Registry.get_classifier_by_algorithm_id( + algorithm_id) + job_result = training_job_result_domain.TrainingJobResult( + job_id, algorithm_id, classifier_data) + job_result_proto = job_result.to_proto() + + # Lint test for no-member needs to be disabled as protobuf generated + # classes are metaclasses and hence their attributes are defined at + # runtime. + self.assertEqual(job_result_proto.job_id, job_id) # pylint: disable=no-member + self.assertEqual( + job_result_proto.WhichOneof('classifier_frozen_model'), + classifier.name_in_job_result_proto) diff --git a/core/platform/protobuf/__init__.py b/core/platform/protobuf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/main.py b/main.py index 7b901a0..7349bf0 100644 --- a/main.py +++ b/main.py @@ -26,7 +26,7 @@ vm_config.configure() # pylint: disable=wrong-import-position -from core.services import job_services +from core.domain import job_services import vmconf def main(): diff --git a/manifest.txt b/manifest.txt index 54964be..52667e6 100644 --- a/manifest.txt +++ b/manifest.txt @@ -24,3 +24,4 @@ scikit-learn 0.18.1 pylint 1.7.1 requests 2.17.1 responses 0.5.1 +protobuf 3.12.0 diff --git a/prototool.yaml b/prototool.yaml new file mode 100644 index 0000000..869e9d9 --- /dev/null +++ b/prototool.yaml @@ -0,0 +1,8 @@ +protoc: + version: 3.8.0 +lint: + group: google +generate: + plugins: + - name: python + output: ./ diff --git a/scripts/pre_commit_linter.py b/scripts/pre_commit_linter.py index e46b62d..791f48a 100644 --- a/scripts/pre_commit_linter.py +++ b/scripts/pre_commit_linter.py @@ -97,7 +97,7 @@ EXCLUDED_PATHS = ( 'third_party/*', '.git/*', '*.pyc', 'CHANGELOG', - 'scripts/pre_commit_linter.py') + 'scripts/pre_commit_linter.py', 'core/domain/proto/*.py') if not 
os.getcwd().endswith('oppia-ml'): print '' @@ -105,6 +105,8 @@ _PYLINT_PATH = os.path.join(os.getcwd(), 'third_party', 'pylint-1.7.1') _MANIFEST_FILE_PATH = os.path.join(os.getcwd(), 'manifest.txt') +_PROTOTOOL_PATH = os.path.join( + os.getcwd(), 'third_party', 'prototool-1.9.0', 'prototool') if not os.path.exists(_PYLINT_PATH): print '' @@ -112,6 +114,11 @@ print ' and its dependencies.' sys.exit(1) +if not os.path.exists(_PROTOTOOL_PATH): + print '' + print 'ERROR Please run start.sh first to install prototool' + sys.exit(1) + # Fix third-party library paths. with open(_MANIFEST_FILE_PATH, 'r') as f: _PATHS_TO_INSERT = [ @@ -191,9 +198,6 @@ def _lint_py_files(config_pylint, files_to_lint, result): config_pylint: str. Path to the .pylintrc file. files_to_lint: list(str). A list of filepaths to lint. result: multiprocessing.Queue. A queue to put results of test. - - Returns: - None """ start_time = time.time() are_there_errors = False @@ -236,6 +240,44 @@ def _lint_py_files(config_pylint, files_to_lint, result): print 'Python linting finished.' +def _lint_proto_files(files_to_lint, result): + """Prints a list of lint errors in the given list of Proto files. + + Args: + files_to_lint: list(str). A list of filepaths to lint. + result: multiprocessing.Queue. A queue to put results of test. + """ + start_time = time.time() + are_there_errors = False + + num_proto_files = len(files_to_lint) + if not files_to_lint: + result.put('') + print 'There are no Proto files to lint.' 
+ return + + print 'Linting %s Proto files' % num_proto_files + + _BATCH_SIZE = 50 + current_batch_start_index = 0 + + for proto_file in files_to_lint: + print('Linting %s' % proto_file) + try: + subprocess.check_output( + [_PROTOTOOL_PATH, 'lint', proto_file]) + except subprocess.CalledProcessError as e: + print(e.output) + are_there_errors = True + + if are_there_errors: + result.put('%s Proto linting failed' % _MESSAGE_TYPE_FAILED) + else: + result.put('%s %s Proto files linted (%.1f secs)' % ( + _MESSAGE_TYPE_SUCCESS, num_proto_files, time.time() - start_time)) + + print 'Proto linting finished.' + def _get_all_files(): """This function is used to check if this script is ran from root directory and to return a list of all the files for linting and @@ -285,21 +327,44 @@ def _pre_commit_linter(all_files): py_files_to_lint = [ filename for filename in all_files if filename.endswith('.py')] + proto_files_to_lint = [ + filename for filename in all_files if filename.endswith('.proto')] - linting_processes = [] + python_linting_processes = [] py_result = multiprocessing.Queue() - linting_processes.append(multiprocessing.Process( + python_linting_processes.append(multiprocessing.Process( target=_lint_py_files, args=(config_pylint, py_files_to_lint, py_result))) print 'Starting Python Linting' print '----------------------------------------' - for process in linting_processes: + for process in python_linting_processes: + process.start() + + for process in python_linting_processes: + # Require timeout parameter to prevent against endless waiting for the + # linting function to return. 
+ process.join(timeout=600) + + print '' + print '----------------------------------------' + + proto_linting_processes = [] + + proto_result = multiprocessing.Queue() + proto_linting_processes.append(multiprocessing.Process( + target=_lint_proto_files, + args=(proto_files_to_lint, proto_result))) + + print 'Starting Proto Linting' + print '----------------------------------------' + + for process in proto_linting_processes: process.start() - for process in linting_processes: + for process in proto_linting_processes: # Require timeout parameter to prevent against endless waiting for the # linting function to return. process.join(timeout=600) @@ -311,6 +376,7 @@ def _pre_commit_linter(all_files): # Require block = False to prevent unnecessary waiting for the process # output. summary_messages.append(py_result.get(block=False)) + summary_messages.append(proto_result.get(block=False)) print '\n'.join(summary_messages) print '' return summary_messages diff --git a/scripts/setup.sh b/scripts/setup.sh index 5957e03..023192e 100644 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -85,4 +85,23 @@ export PYTHON_CMD # Set PYTHONPATH. export PYTHONPATH=$OPPIA_ML_DIR:$PYTHONPATH +# Set prototool path. +export PROTOTOOL_PATH=$THIRD_PARTY_DIR/prototool-1.9.0 +export PROTOTOOL=$PROTOTOOL_PATH/prototool + +echo Checking if prototool is installed in $PROTOTOOL_PATH + +if [ ! -d "$PROTOTOOL_PATH" ]; then + echo Installing prototool + mkdir "$PROTOTOOL_PATH" + curl -sSL "https://github.com/uber/prototool/releases/download/v1.9.0/prototool-$(uname -s)-$(uname -m)" \ + -o "$PROTOTOOL_PATH/prototool" + chmod +x "$PROTOTOOL_PATH/prototool" +fi + +# Compile proto files +echo Compiling protobuf files +$PROTOTOOL generate core/domain/proto +echo protobuf files compilation done + export SETUP_DONE=true diff --git a/vmconf.py b/vmconf.py index a005198..5ae2867 100644 --- a/vmconf.py +++ b/vmconf.py @@ -63,7 +63,7 @@ # Algorithm IDs of different classifier algorithms. 
These IDs are used to obtain # instance of classifier algorithm using algorithm_registry. # Note: we need same IDs in Oppia as well. -ALGORITHM_IDS = ['CodeClassifier', 'TextClassifier'] +ALGORITHM_IDS = ['TextClassifier'] # Path of the directory which stores classifiers. CLASSIFIERS_DIR = os.path.join('core', 'classifiers')