workforce-data-initiative · thcrock · Nov 8, 2018 · Nov 7, 2018 · Nov 7, 2018 · Nov 7, 2018
diff --git a/skills_ml/algorithms/embedding/train.py b/skills_ml/algorithms/embedding/train.py
@@ -142,6 +142,7 @@ def save_model(self, storage=None):
             ms = self.model_storage
 
         for model in self._models:
+            model.storage = ms.storage
             ms.save_model(model, model.model_name)
             logging.info(f"{model.model_name} has been stored to {ms.storage.path}.")
 

diff --git a/skills_ml/algorithms/occupation_classifiers/__init__.py b/skills_ml/algorithms/occupation_classifiers/__init__.py
@@ -78,7 +78,7 @@ class FullSOC(TargetVariable):
     def __init__(self, filters=None, onet_cache=None):
         super().__init__(filters)
         self.default_filters = [unknown_soc_filter, empty_soc_filter]
-        self.choices = Onet().all_soc
+        self.choices = onet_cache.all_soc if onet_cache else Onet().all_soc
         self.encoder = SocEncoder(self.choices)
 
     def extract_occupation_from_jobposting(self, job_posting):
@@ -120,6 +120,7 @@ def _combine_pipelines(self):
         return combined
 
     def build(self):
+        logging.info("Building matrix")
         for i, item in enumerate(self._combine_pipelines()):
             self._X.append(item[0])
             self._y.append(item[1])

diff --git a/skills_ml/algorithms/occupation_classifiers/train.py b/skills_ml/algorithms/occupation_classifiers/train.py
@@ -68,6 +68,9 @@ def train(self, save=True):
         is vailable in this package's environment and implements .fit
         """
         logging.info(f"Start training {self.train_time}")
+        if len(self.matrix.X) == 0:
+            self.matrix.build()
+
         X = self.matrix.X
         y = self.matrix.y
         store_path = os.path.join(self.storage.path, self.train_time)
@@ -82,7 +85,7 @@ def train(self, save=True):
                 kf = StratifiedKFold(n_splits=self.k_folds, random_state=self.random_state_for_split)
                 model_hash = self._model_hash(self.matrix.metadata, class_name, parameter_config)
                 trained_model_name = class_name.lower() + "_" + model_hash
-                self.storage.path = os.path.join(store_path, score, trained_model_name)
+                self.storage.path = os.path.join(store_path, score)
                 if 'n_jobs' in inspect.signature(cls).parameters.keys():
                     cls_cv = ProxyObjectWithStorage(
                             model_obj=GridSearchCV(

diff --git a/skills_ml/algorithms/sampling/methods.py b/skills_ml/algorithms/sampling/methods.py
@@ -30,7 +30,7 @@ def reservoir(it, k):
         yield result.pop()
 
 
-def reservoir_weighted(it, k, weights):
+def reservoir_weighted(it, k, weights, key):
     """Weighted reservoir Sampling from job posting iterator
 
     Randomly choosing a sample of k items from a streaming iterator based on the weights.
@@ -51,7 +51,7 @@ def reservoir_weighted(it, k, weights):
     heap = []
     hkey = lambda w: np.power(np.random.uniform(0.0, 1.0), 1.0 / w)
     for i, datum in enumerate(it):
-        weight = weights[datum[1]]
+        weight = weights[key(datum)]
         score = hkey(weight)
         if len(heap) < k:
             hq.heappush(heap, (hkey(weight), datum))

diff --git a/skills_ml/evaluation/embedding_metrics.py b/skills_ml/evaluation/embedding_metrics.py
@@ -42,7 +42,7 @@ def eval(self, vectorization: Callable) -> Dict:
         result = {}
         for concept, entities in self.clustering.items():
             centroid = np.average([vectorization(entity[1]) for entity in entities], axis=0)
-            result[concept] = distance.cosine(vectorization(concept), centroid)
+            result[concept] = distance.cosine(vectorization(concept), centroid).astype(float)
         self.eval_result = result
         return result
 
@@ -65,7 +65,7 @@ def eval(self, vectorization: Callable) -> Dict:
         for concept, entities in self.clustering.items():
             entities_vec = [vectorization(entity[1]) for entity in entities]
             centroid = np.average(entities_vec, axis=0)
-            result[concept] = np.sum((entities_vec - centroid)**2)
+            result[concept] = np.sum((entities_vec - centroid)**2).astype(float)
         self.eval_result = result
         return result
 

diff --git a/skills_ml/evaluation/occ_cls_evaluator.py b/skills_ml/evaluation/occ_cls_evaluator.py
@@ -12,7 +12,7 @@ def __init__(self, result_generator):
         else:
             self.target_variable = self.result_generator.target_variable
             self.labels = self.target_variable.choices
-        self.result = np.array(list(result_generator))
+        self.result = np.array(list(self.result_generator))
 
     @cachedproperty
     def y_pred(self):
@@ -68,14 +68,13 @@ def micro_f1(self):
 
 
 class OnetOccupationClassificationEvaluator(ClassificationEvaluator):
-    def __init__(self,result_generator):
+    def __init__(self, result_generator):
         super().__init__(result_generator)
-        if not hasattr(self.result_generator,'target_variable'):
+        if not hasattr(self.result_generator, 'target_variable'):
             raise AttributeError("the result_generator should have target_variable property")
         else:
             self.target_variable = self.result_generator.target_variable
             self.labels = self.target_variable.choices
-        self.result = np.array(list(result_generator))
 
     @cachedproperty
     def _result_for_major_group(self):

diff --git a/skills_ml/job_postings/sample.py b/skills_ml/job_postings/sample.py
@@ -14,47 +14,26 @@ class JobSampler(object):
     Attributes:
         job_posting_generator (iterator): Job posting iterator to sample from.
         k (int): number of documents to sample
-        major_group (bool): A flag for using major_group as a label or not
-        keys (list|str): a key or keys(for nested dictionary) indicates the label which should exist in common schema
-                         of job posting.
         weights (dict): a dictionary that has key-value pairs as label-weighting pairs. It expects every
                         label in the iterator to be present as a key in the weights dictionary For example,
                         weights = {'11': 2, '13', 1}. In this case, the label/key is the occupation major
                         group and the value is the weight you want to sample with.
+        key (callable): a function called on each element that returns the label used to look up its weight in the weights dictionary
         random_state (int): the seed used by the random number generator
 
     """
-    def __init__(self, job_posting_generator, k, major_group=False, keys=None, weights=None, random_state=None):
+    def __init__(self, job_posting_generator, k, weights=None, key=lambda x: x, random_state=None):
         self.job_posting_generator = job_posting_generator
         self.k = k
-        self.major_group = major_group
+        self.key = key
         self.weights = weights
-        self.keys = keys
         self.random_state = random_state
         if random_state:
             np.random.seed(random_state)
             random.seed(random_state)
 
-    def _transform_generator(self, job_posting_generator):
-        if isinstance(self.keys, list):
-            for job in job_posting_generator:
-                yield (job, safe_get(job, *self.keys))
-        elif isinstance(self.keys, str):
-            for job in job_posting_generator:
-                yield (job, job[self.keys])
-        elif self.major_group:
-            for job in job_posting_generator:
-                try:
-                    yield (job, job['onet_soc_code'][:2])
-                except TypeError:
-                    yield (job, None)
-        else:
-            for job in job_posting_generator:
-                yield (job, )
-
     def __iter__(self):
-        it = self._transform_generator(self.job_posting_generator)
         if self.weights:
-            yield from reservoir_weighted(it, self.k, self.weights)
+            yield from reservoir_weighted(self.job_posting_generator, self.k, self.weights, self.key)
         else:
-            yield from reservoir(it, self.k)
+            yield from reservoir(self.job_posting_generator, self.k)
diff --git a/skills_ml/ontologies/onet.py b/skills_ml/ontologies/onet.py
@@ -5,29 +5,29 @@
 import logging
 
 majorgroupname = {
-    '11': 'Management Occupations',
-    '13': 'Business and Financial Operations Occupations',
-    '15': 'Computer and Mathematical Occupations',
-    '17': 'Architecture and Engineering Occupations',
-    '19': 'Life, Physical, and Social Science Occupations',
-    '21': 'Community and Social Service Occupations',
-    '23': 'Legal Occupations',
-    '25': 'Education, Training, and Library Occupations',
-    '27': 'Arts, Design, Entertainment, Sports, and Media Occupations',
-    '29': 'Healthcare Practitioners and Technical Occupations',
-    '31': 'Healthcare Support Occupations',
-    '33': 'Protective Service Occupations',
-    '35': 'Food Preparation and Serving Related Occupations',
+    '11': 'Management',
+    '13': 'Business and Financial Operations',
+    '15': 'Computer and Mathematical',
+    '17': 'Architecture and Engineering',
+    '19': 'Life, Physical, and Social Science',
+    '21': 'Community and Social Service',
+    '23': 'Legal',
+    '25': 'Education, Training, and Library',
+    '27': 'Arts, Design, Entertainment, Sports, and Media',
+    '29': 'Healthcare Practitioners and Technical',
+    '31': 'Healthcare Support',
+    '33': 'Protective Service',
+    '35': 'Food Preparation and Serving Related',
     '37': 'Building and Grounds Cleaning and Maintenance',
-    '39': 'Personal Care and Service Occupations',
-    '41': 'Sales and Related Occupations',
-    '43': 'Office and Administrative Support Occupations',
-    '45': 'Farming, Fishing, and Forestry Occupations',
-    '47': 'Construction and Extraction Occupations',
-    '49': 'Installation, Maintenance, and Repair Occupations',
-    '51': 'Production Occupations',
-    '53': 'Transportation and Material Moving Occupations',
-    '55': 'Military Specific Occupations'
+    '39': 'Personal Care and Service',
+    '41': 'Sales and Related',
+    '43': 'Office and Administrative Support',
+    '45': 'Farming, Fishing, and Forestry',
+    '47': 'Construction and Extraction',
+    '49': 'Installation, Maintenance, and Repair',
+    '51': 'Production',
+    '53': 'Transportation and Material Moving',
+    '55': 'Military Specific'
 }
 
 

diff --git a/tests/job_postings/test_job_sampler.py b/tests/job_postings/test_job_sampler.py
@@ -1,4 +1,5 @@
 from skills_ml.job_postings.sample import JobSampler
+from skills_ml.job_postings.filtering import JobPostingFilterer
 from skills_utils.common import safe_get
 import gensim
 from collections import Counter
@@ -85,35 +86,35 @@ def test_soc(self):
 
         result = []
         for i in range(self.num_loops):
-            result.extend(list(map(lambda x: x[0]['onet_soc_code'], js)))
+            result.extend(list(map(lambda x: x['onet_soc_code'], js)))
 
         counts = dict(Counter(result))
         assert np.mean(np.array(list(counts.values()))) == self.num_loops * self.sample_size / self.occ_num
 
     def test_state(self):
+        transformer = lambda job: safe_get(job, 'jobLocation', 'address', 'addressRegion')
         js = JobSampler(
                 job_posting_generator=self.fake_corpus_train,
                 k=self.sample_size,
-                keys=['jobLocation', 'address', 'addressRegion']
         )
 
         result = []
         for i in range(self.num_loops):
-            result.extend(list(map(lambda x: x[1], js)))
+            result.extend(list(map(lambda x: transformer(x), js)))
 
         counts = dict(Counter(result))
         assert np.mean(np.array(list(counts.values()))) == self.num_loops * self.sample_size / len(self.states)
 
     def test_employment_type(self):
+        transformer = lambda job: safe_get(job, 'employmentType')
         js = JobSampler(
                 job_posting_generator=self.fake_corpus_train,
                 k=self.sample_size,
-                keys='employmentType'
         )
 
         result = []
         for i in range(self.num_loops):
-            result.extend(list(map(lambda x: x[1], js)))
+            result.extend(list(map(lambda x: transformer(x), js)))
 
         counts = dict(Counter(result))
         assert np.mean(np.array(list(counts.values()))) == self.num_loops * self.sample_size / len(self.employment_type)
@@ -131,15 +132,23 @@ def test_major_group(self):
 
         ratio = self.weights['13'] / self.weights['11']
 
+        major_group_filter = lambda job: job['onet_soc_code'][:2] in ['11', '13']
+
+        filtered_jobposting = JobPostingFilterer(
+                self.fake_corpus_train,
+                [major_group_filter]
+                )
+
         js = JobSampler(
-                job_posting_generator=self.fake_corpus_train,
+                job_posting_generator=filtered_jobposting,
                 k=self.sample_size,
                 weights=self.weights,
-                major_group=True)
+                key=lambda job: job['onet_soc_code'][:2]
+                )
 
         result = []
         for i in range(self.num_loops):
-            r = list(map(lambda x: x[1][:2], js))
+            r = list(map(lambda x: x['onet_soc_code'][:2], js))
             counts = dict(Counter(r))
             result.append(counts['13'] / counts['11'])
 

diff --git a/tests/ontologies/test_clustering.py b/tests/ontologies/test_clustering.py
@@ -34,11 +34,11 @@ def test_basic(self):
         assert set(occupation_clustering.keys()) == set([major_group_37_concept.name, major_group_35_concept.name])
         assert occupation_clustering["Building and Grounds Cleaning and Maintenance"] == major_group_37_entities
         assert occupation_clustering.map_raw_key["Building and Grounds Cleaning and Maintenance"] == major_group_37_concept
-        assert occupation_clustering["Food Preparation and Serving Related Occupations"] == major_group_35_entities
-        assert occupation_clustering.map_raw_key["Food Preparation and Serving Related Occupations"] == major_group_35_concept
+        assert occupation_clustering["Food Preparation and Serving Related"] == major_group_35_entities
+        assert occupation_clustering.map_raw_key["Food Preparation and Serving Related"] == major_group_35_concept
 
         # Delete
-        del occupation_clustering["Food Preparation and Serving Related Occupations"]
+        del occupation_clustering["Food Preparation and Serving Related"]
         assert len(occupation_clustering) == 1
 
         # Iterable