Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug Fix and a Couple of Changes for Skills-ML Tour #251

Merged
merged 9 commits into from
Nov 8, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions skills_ml/algorithms/embedding/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ def save_model(self, storage=None):
ms = self.model_storage

for model in self._models:
model.storage = ms.storage
ms.save_model(model, model.model_name)
logging.info(f"{model.model_name} has been stored to {ms.storage.path}.")

Expand Down
3 changes: 2 additions & 1 deletion skills_ml/algorithms/occupation_classifiers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class FullSOC(TargetVariable):
def __init__(self, filters=None, onet_cache=None):
super().__init__(filters)
self.default_filters = [unknown_soc_filter, empty_soc_filter]
self.choices = Onet().all_soc
self.choices = onet_cache.all_soc if onet_cache else Onet().all_soc
self.encoder = SocEncoder(self.choices)

def extract_occupation_from_jobposting(self, job_posting):
Expand Down Expand Up @@ -120,6 +120,7 @@ def _combine_pipelines(self):
return combined

def build(self):
logging.info("Building matrix")
for i, item in enumerate(self._combine_pipelines()):
self._X.append(item[0])
self._y.append(item[1])
Expand Down
5 changes: 4 additions & 1 deletion skills_ml/algorithms/occupation_classifiers/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ def train(self, save=True):
is available in this package's environment and implements .fit
"""
logging.info(f"Start training {self.train_time}")
if len(self.matrix.X) == 0:
self.matrix.build()

X = self.matrix.X
y = self.matrix.y
store_path = os.path.join(self.storage.path, self.train_time)
Expand All @@ -82,7 +85,7 @@ def train(self, save=True):
kf = StratifiedKFold(n_splits=self.k_folds, random_state=self.random_state_for_split)
model_hash = self._model_hash(self.matrix.metadata, class_name, parameter_config)
trained_model_name = class_name.lower() + "_" + model_hash
self.storage.path = os.path.join(store_path, score, trained_model_name)
self.storage.path = os.path.join(store_path, score)
if 'n_jobs' in inspect.signature(cls).parameters.keys():
cls_cv = ProxyObjectWithStorage(
model_obj=GridSearchCV(
Expand Down
4 changes: 2 additions & 2 deletions skills_ml/algorithms/sampling/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def reservoir(it, k):
yield result.pop()


def reservoir_weighted(it, k, weights):
def reservoir_weighted(it, k, weights, key):
"""Weighted reservoir Sampling from job posting iterator

Randomly choosing a sample of k items from a streaming iterator based on the weights.
Expand All @@ -51,7 +51,7 @@ def reservoir_weighted(it, k, weights):
heap = []
hkey = lambda w: np.power(np.random.uniform(0.0, 1.0), 1.0 / w)
for i, datum in enumerate(it):
weight = weights[datum[1]]
weight = weights[key(datum)]
score = hkey(weight)
if len(heap) < k:
hq.heappush(heap, (hkey(weight), datum))
Expand Down
4 changes: 2 additions & 2 deletions skills_ml/evaluation/embedding_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def eval(self, vectorization: Callable) -> Dict:
result = {}
for concept, entities in self.clustering.items():
centroid = np.average([vectorization(entity[1]) for entity in entities], axis=0)
result[concept] = distance.cosine(vectorization(concept), centroid)
result[concept] = distance.cosine(vectorization(concept), centroid).astype(float)
self.eval_result = result
return result

Expand All @@ -65,7 +65,7 @@ def eval(self, vectorization: Callable) -> Dict:
for concept, entities in self.clustering.items():
entities_vec = [vectorization(entity[1]) for entity in entities]
centroid = np.average(entities_vec, axis=0)
result[concept] = np.sum((entities_vec - centroid)**2)
result[concept] = np.sum((entities_vec - centroid)**2).astype(float)
self.eval_result = result
return result

Expand Down
7 changes: 3 additions & 4 deletions skills_ml/evaluation/occ_cls_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def __init__(self, result_generator):
else:
self.target_variable = self.result_generator.target_variable
self.labels = self.target_variable.choices
self.result = np.array(list(result_generator))
self.result = np.array(list(self.result_generator))

@cachedproperty
def y_pred(self):
Expand Down Expand Up @@ -68,14 +68,13 @@ def micro_f1(self):


class OnetOccupationClassificationEvaluator(ClassificationEvaluator):
def __init__(self,result_generator):
def __init__(self, result_generator):
super().__init__(result_generator)
if not hasattr(self.result_generator,'target_variable'):
if not hasattr(self.result_generator, 'target_variable'):
raise AttributeError("the result_generator should have target_variable property")
else:
self.target_variable = self.result_generator.target_variable
self.labels = self.target_variable.choices
self.result = np.array(list(result_generator))

@cachedproperty
def _result_for_major_group(self):
Expand Down
31 changes: 5 additions & 26 deletions skills_ml/job_postings/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,47 +14,26 @@ class JobSampler(object):
Attributes:
job_posting_generator (iterator): Job posting iterator to sample from.
k (int): number of documents to sample
major_group (bool): A flag for using major_group as a label or not
keys (list|str): a key or keys(for nested dictionary) indicates the label which should exist in common schema
of job posting.
weights (dict): a dictionary that has key-value pairs as label-weighting pairs. It expects every
label in the iterator to be present as a key in the weights dictionary. For example,
weights = {'11': 2, '13': 1}. In this case, the label/key is the occupation major
group and the value is the weight you want to sample with.
key (callable): a function called on each element to produce the key used to look up its weight in the weights dictionary
random_state (int): the seed used by the random number generator

"""
def __init__(self, job_posting_generator, k, major_group=False, keys=None, weights=None, random_state=None):
def __init__(self, job_posting_generator, k, weights=None, key=lambda x: x, random_state=None):
self.job_posting_generator = job_posting_generator
self.k = k
self.major_group = major_group
self.key = key
self.weights = weights
self.keys = keys
self.random_state = random_state
if random_state:
np.random.seed(random_state)
random.seed(random_state)

def _transform_generator(self, job_posting_generator):
if isinstance(self.keys, list):
for job in job_posting_generator:
yield (job, safe_get(job, *self.keys))
elif isinstance(self.keys, str):
for job in job_posting_generator:
yield (job, job[self.keys])
elif self.major_group:
for job in job_posting_generator:
try:
yield (job, job['onet_soc_code'][:2])
except TypeError:
yield (job, None)
else:
for job in job_posting_generator:
yield (job, )

def __iter__(self):
it = self._transform_generator(self.job_posting_generator)
if self.weights:
yield from reservoir_weighted(it, self.k, self.weights)
yield from reservoir_weighted(self.job_posting_generator, self.k, self.weights, self.key)
else:
yield from reservoir(it, self.k)
yield from reservoir(self.job_posting_generator, self.k)
44 changes: 22 additions & 22 deletions skills_ml/ontologies/onet.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,29 @@
import logging

majorgroupname = {
'11': 'Management Occupations',
'13': 'Business and Financial Operations Occupations',
'15': 'Computer and Mathematical Occupations',
'17': 'Architecture and Engineering Occupations',
'19': 'Life, Physical, and Social Science Occupations',
'21': 'Community and Social Service Occupations',
'23': 'Legal Occupations',
'25': 'Education, Training, and Library Occupations',
'27': 'Arts, Design, Entertainment, Sports, and Media Occupations',
'29': 'Healthcare Practitioners and Technical Occupations',
'31': 'Healthcare Support Occupations',
'33': 'Protective Service Occupations',
'35': 'Food Preparation and Serving Related Occupations',
'11': 'Management',
'13': 'Business and Financial Operations',
'15': 'Computer and Mathematical',
'17': 'Architecture and Engineering',
'19': 'Life, Physical, and Social Science',
'21': 'Community and Social Service',
'23': 'Legal',
'25': 'Education, Training, and Library',
'27': 'Arts, Design, Entertainment, Sports, and Media',
'29': 'Healthcare Practitioners and Technical',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation and Serving Related',
'37': 'Building and Grounds Cleaning and Maintenance',
'39': 'Personal Care and Service Occupations',
'41': 'Sales and Related Occupations',
'43': 'Office and Administrative Support Occupations',
'45': 'Farming, Fishing, and Forestry Occupations',
'47': 'Construction and Extraction Occupations',
'49': 'Installation, Maintenance, and Repair Occupations',
'51': 'Production Occupations',
'53': 'Transportation and Material Moving Occupations',
'55': 'Military Specific Occupations'
'39': 'Personal Care and Service',
'41': 'Sales and Related',
'43': 'Office and Administrative Support',
'45': 'Farming, Fishing, and Forestry',
'47': 'Construction and Extraction',
'49': 'Installation, Maintenance, and Repair',
'51': 'Production',
'53': 'Transportation and Material Moving',
'55': 'Military Specific'
}


Expand Down
25 changes: 17 additions & 8 deletions tests/job_postings/test_job_sampler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from skills_ml.job_postings.sample import JobSampler
from skills_ml.job_postings.filtering import JobPostingFilterer
from skills_utils.common import safe_get
import gensim
from collections import Counter
Expand Down Expand Up @@ -85,35 +86,35 @@ def test_soc(self):

result = []
for i in range(self.num_loops):
result.extend(list(map(lambda x: x[0]['onet_soc_code'], js)))
result.extend(list(map(lambda x: x['onet_soc_code'], js)))

counts = dict(Counter(result))
assert np.mean(np.array(list(counts.values()))) == self.num_loops * self.sample_size / self.occ_num

def test_state(self):
transformer = lambda job: safe_get(job, 'jobLocation', 'address', 'addressRegion')
js = JobSampler(
job_posting_generator=self.fake_corpus_train,
k=self.sample_size,
keys=['jobLocation', 'address', 'addressRegion']
)

result = []
for i in range(self.num_loops):
result.extend(list(map(lambda x: x[1], js)))
result.extend(list(map(lambda x: transformer(x), js)))

counts = dict(Counter(result))
assert np.mean(np.array(list(counts.values()))) == self.num_loops * self.sample_size / len(self.states)

def test_employment_type(self):
transformer = lambda job: safe_get(job, 'employmentType')
js = JobSampler(
job_posting_generator=self.fake_corpus_train,
k=self.sample_size,
keys='employmentType'
)

result = []
for i in range(self.num_loops):
result.extend(list(map(lambda x: x[1], js)))
result.extend(list(map(lambda x: transformer(x), js)))

counts = dict(Counter(result))
assert np.mean(np.array(list(counts.values()))) == self.num_loops * self.sample_size / len(self.employment_type)
Expand All @@ -131,15 +132,23 @@ def test_major_group(self):

ratio = self.weights['13'] / self.weights['11']

major_group_filter = lambda job: job['onet_soc_code'][:2] in ['11', '13']

filtered_jobposting = JobPostingFilterer(
self.fake_corpus_train,
[major_group_filter]
)

js = JobSampler(
job_posting_generator=self.fake_corpus_train,
job_posting_generator=filtered_jobposting,
k=self.sample_size,
weights=self.weights,
major_group=True)
key=lambda job: job['onet_soc_code'][:2]
)

result = []
for i in range(self.num_loops):
r = list(map(lambda x: x[1][:2], js))
r = list(map(lambda x: x['onet_soc_code'][:2], js))
counts = dict(Counter(r))
result.append(counts['13'] / counts['11'])

Expand Down
6 changes: 3 additions & 3 deletions tests/ontologies/test_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ def test_basic(self):
assert set(occupation_clustering.keys()) == set([major_group_37_concept.name, major_group_35_concept.name])
assert occupation_clustering["Building and Grounds Cleaning and Maintenance"] == major_group_37_entities
assert occupation_clustering.map_raw_key["Building and Grounds Cleaning and Maintenance"] == major_group_37_concept
assert occupation_clustering["Food Preparation and Serving Related Occupations"] == major_group_35_entities
assert occupation_clustering.map_raw_key["Food Preparation and Serving Related Occupations"] == major_group_35_concept
assert occupation_clustering["Food Preparation and Serving Related"] == major_group_35_entities
assert occupation_clustering.map_raw_key["Food Preparation and Serving Related"] == major_group_35_concept

# Delete
del occupation_clustering["Food Preparation and Serving Related Occupations"]
del occupation_clustering["Food Preparation and Serving Related"]
assert len(occupation_clustering) == 1

# Iterable
Expand Down