Merge pull request #32 from workforce-data-initiative/ksa_distinction

Add ONET KSA distinctions to skills_master [Resolves #30]
workforce-data-initiative · Mar 3, 2017 · b1d1b05 · b1d1b05
2 parents 774a5d5 + 9fff23c
commit b1d1b05
Show file tree

Hide file tree

Showing 14 changed files with 628 additions and 428 deletions.
diff --git a/algorithms/skill_extractors/onet_ksas.py b/algorithms/skill_extractors/onet_ksas.py
@@ -20,7 +20,7 @@ def __init__(self, onet_source, output_filename, hash_function):
         self.onet_source = onet_source
         self.hash_function = hash_function
 
-    def onet_to_pandas(self, filename, col_name, use_relevance=True):
+    def onet_to_pandas(self, filename, col_name, ksa_type, use_relevance=True):
         """
         Args:
             filename: an unpathed filename referring to an ONET skill file
@@ -48,6 +48,9 @@ def onet_to_pandas(self, filename, col_name, use_relevance=True):
                 else:
                     onet = [row for row in csv.DictReader(f, delimiter='\t')]
         onet = pd.DataFrame(onet)
+        if ksa_type:
+            col_name = col_name + ['ksa_type']
+            onet['ksa_type'] = ksa_type
 
         for col in col_name:
             onet[col] = onet[col].astype(str).str.lower()
@@ -61,18 +64,19 @@ def run(self):
         nlp = NLPTransforms()
         # create dataframes for each KSA type
         standard_columns = ['O*NET-SOC Code', 'Element ID', 'Element Name']
-        skills = self.onet_to_pandas('Skills.txt', standard_columns)
-        ability = self.onet_to_pandas('Abilities.txt', standard_columns)
-        knowledge = self.onet_to_pandas('Knowledge.txt', standard_columns)
+        skills = self.onet_to_pandas('Skills.txt', standard_columns, 'skill')
+        ability = self.onet_to_pandas('Abilities.txt', standard_columns, 'ability')
+        knowledge = self.onet_to_pandas('Knowledge.txt', standard_columns, 'knowledge')
         tools = self.onet_to_pandas(
             'Tools and Technology.txt',
             ['O*NET-SOC Code', 'Commodity Code', 'T2 Example'],
+            'tool',
             use_relevance=False
         )
 
         # Concat KSA dataframes into one table
         # note significant duplications since it's by ONET SOC Code
-        new_columns = ['O*NET-SOC Code', 'Element ID', 'ONET KSA']
+        new_columns = ['O*NET-SOC Code', 'Element ID', 'ONET KSA', 'ksa_type']
         skills.columns = new_columns
         ability.columns = new_columns
         knowledge.columns = new_columns
@@ -83,6 +87,7 @@ def run(self):
         onet_modelreference = self.onet_to_pandas(
             'Content Model Reference.txt',
             ['Element ID', 'Description'],
+            ksa_type=None,
             use_relevance=False
         )
 

diff --git a/api_sync/v1/alembic.ini b/api_sync/v1/alembic.ini
@@ -0,0 +1,65 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts
+script_location = alembic
+
+# template used to generate migration files
+# file_template = %%(rev)s_%%(slug)s
+
+# max length of characters to apply to the
+# "slug" field
+#truncate_slug_length = 40
+
+# set to 'true' to run the environment during
+# the 'revision' command, regardless of autogenerate
+# revision_environment = false
+
+# set to 'true' to allow .pyc and .pyo files without
+# a source .py file to be detected as revisions in the
+# versions/ directory
+# sourceless = false
+
+# version location specification; this defaults
+# to alembic/versions.  When using multiple version
+# directories, initial revisions must be specified with --version-path
+# version_locations = %(here)s/bar %(here)s/bat alembic/versions
+
+# the output encoding used when revision files
+# are written from script.py.mako
+# output_encoding = utf-8
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
diff --git a/api_sync/v1/alembic/README b/api_sync/v1/alembic/README
@@ -0,0 +1 @@
+Generic single-database configuration.
diff --git a/api_sync/v1/alembic/env.py b/api_sync/v1/alembic/env.py
@@ -0,0 +1,75 @@
+from __future__ import with_statement
+from alembic import context
+from sqlalchemy import engine_from_config, pool
+from logging.config import fileConfig
+from utils.db import get_apiv1_dburl
+from api_sync.v1.models import Base
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Configure Python logging from the [loggers], [handlers] and
+# [formatters] sections of the .ini file in use.
+fileConfig(config.config_file_name)
+
+# MetaData object of the API v1 models, taken from the Base class
+# imported from api_sync.v1.models above.
+# Alembic uses it for 'autogenerate' support, comparing these table
+# definitions with the schema found in the database.
+target_metadata = Base.metadata
+
+# Other values from the config, as needed by env.py, can be read with
+# config.get_main_option("<option name>").
+# alembic.ini defines no 'sqlalchemy.url'; the database URL comes from
+# utils.db.get_apiv1_dburl() and is injected into the config below.
+
+db_url = str(get_apiv1_dburl())
+config.set_main_option('sqlalchemy.url', db_url)
+
+
+def run_migrations_offline():
+    """Run migrations in 'offline' mode.
+
+    This configures the context with just a URL
+    and not an Engine, though an Engine is acceptable
+    here as well.  By skipping the Engine creation
+    we don't even need a DBAPI to be available.
+
+    Calls to context.execute() here emit the given string to the
+    script output.
+
+    """
+    context.configure(
+        url=db_url,
+        target_metadata=target_metadata, literal_binds=True)
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online():
+    """Run migrations in 'online' mode.
+
+    In this scenario we need to create an Engine
+    and associate a connection with the context.
+
+    """
+    connectable = engine_from_config(
+        config.get_section(config.config_ini_section),
+        prefix='sqlalchemy.',
+        poolclass=pool.NullPool)
+
+    with connectable.connect() as connection:
+        context.configure(
+            connection=connection,
+            target_metadata=target_metadata
+        )
+
+        with context.begin_transaction():
+            context.run_migrations()
+
+if context.is_offline_mode():
+    run_migrations_offline()
+else:
+    run_migrations_online()
diff --git a/api_sync/v1/alembic/script.py.mako b/api_sync/v1/alembic/script.py.mako
@@ -0,0 +1,23 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision = ${repr(up_revision)}
+down_revision = ${repr(down_revision)}
+branch_labels = ${repr(branch_labels)}
+depends_on = ${repr(depends_on)}
+
+def upgrade():
+    ${upgrades if upgrades else "pass"}
+
+
+def downgrade():
+    ${downgrades if downgrades else "pass"}
diff --git a/api_sync/v1/alembic/versions/d523e7e45456_added_ksa_type.py b/api_sync/v1/alembic/versions/d523e7e45456_added_ksa_type.py
@@ -0,0 +1,23 @@
+"""Added ksa_type
+
+Revision ID: d523e7e45456
+Revises: 
+Create Date: 2017-02-27 15:21:53.018866
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'd523e7e45456'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+def upgrade():
+    op.add_column('skills_master', sa.Column('ksa_type', sa.String))
+
+
+def downgrade():
+    op.drop_column('skills_master', 'ksa_type')
diff --git a/api_sync/v1/models/skills_master.py b/api_sync/v1/models/skills_master.py
@@ -11,16 +11,10 @@ class SkillMaster(Base):
 
     uuid = db.Column(db.String, primary_key=True)
     skill_name = db.Column(db.String)
+    ksa_type = db.Column(db.String)
     onet_element_id = db.Column(db.String)
     description = db.Column(db.String)
     nlp_a = db.Column(db.String)
 
-    def __init__(self, uuid, skill_name, onet_element_id, description, nlp_a):
-        self.uuid = uuid
-        self.skill_name = skill_name
-        self.onet_element_id = onet_element_id
-        self.description = description
-        self.nlp_a = nlp_a
-
     def __repr__(self):
         return '<uuid {}>'.format(self.uuid)
diff --git a/api_sync/v1/skills_master.py b/api_sync/v1/skills_master.py
@@ -9,12 +9,12 @@ def load_skills_master(filename, db_engine):
         reader = csv.DictReader(f, delimiter='\t')
         session = sessionmaker(db_engine)()
         for row in reader:
-            skill_master = SkillMaster(
+            session.merge(SkillMaster(
                 uuid=row['skill_uuid'],
                 skill_name=row['ONET KSA'],
+                ksa_type=row['ksa_type'],
                 onet_element_id=row['Element ID'],
                 description=row['Description'],
                 nlp_a=row['nlp_a']
-            )
-            session.merge(skill_master)
+            ))
         session.commit()
diff --git a/requirements.txt b/requirements.txt
@@ -22,3 +22,4 @@ Sqlalchemy
 testing.postgresql
 psycopg2
 mock
+alembic