Merge pull request #2 from seasidesparrow/main
Dev setup and testing
seasidesparrow committed Feb 15, 2024
2 parents 25240a2 + 46651f0 commit 09d24e5
Showing 22 changed files with 993 additions and 14 deletions.
14 changes: 14 additions & 0 deletions .github/ISSUE_TEMPLATE/1-bug_report.md
@@ -0,0 +1,14 @@
---
name: Bug report
about: Create a report to help us improve
labels: bug
---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:

**Additional context**
Add any other context about the problem here.
14 changes: 14 additions & 0 deletions .github/ISSUE_TEMPLATE/2-feature-request.md
@@ -0,0 +1,14 @@
---
name: Feature request
about: Suggest an idea for this project
labels: enhancement
---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Additional context**
Add any other context or screenshots about the feature request here.
79 changes: 79 additions & 0 deletions .github/labels.toml
@@ -0,0 +1,79 @@
[breaking]
color = "ffcc00"
name = "breaking"
description = "Breaking change."

[bug]
color = "d73a4a"
name = "bug"
description = "Something isn't working"

[dependencies]
color = "0366d6"
name = "dependencies"
description = "Pull requests that update a dependency file"

[github_actions]
color = "000000"
name = "github_actions"
description = "Update of github actions"

[documentation]
color = "1bc4a5"
name = "documentation"
description = "Improvements or additions to documentation"

[duplicate]
color = "cfd3d7"
name = "duplicate"
description = "This issue or pull request already exists"

[enhancement]
color = "a2eeef"
name = "enhancement"
description = "New feature or request"

["good first issue"]
color = "7057ff"
name = "good first issue"
description = "Good for newcomers"

["help wanted"]
color = "008672"
name = "help wanted"
description = "Extra attention is needed"

[invalid]
color = "e4e669"
name = "invalid"
description = "This doesn't seem right"

[nochangelog]
color = "555555"
name = "nochangelog"
description = "Exclude pull requests from changelog"

[question]
color = "d876e3"
name = "question"
description = "Further information is requested"

[removed]
color = "e99695"
name = "removed"
description = "Removed piece of functionalities."

[tests]
color = "bfd4f2"
name = "tests"
description = "CI, CD and testing related changes"

[wontfix]
color = "ffffff"
name = "wontfix"
description = "This will not be worked on"

[discussion]
color = "c2e0c6"
name = "discussion"
description = "Some discussion around the project"
29 changes: 29 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,29 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: "CHANGELOG.md"

ci:
autofix_commit_msg: "chore(pre-commit.ci): auto fixes"
autoupdate_commit_msg: "chore(pre-commit.ci): pre-commit autoupdate"

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-json
- id: debug-statements
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
hooks:
- id: isort
- repo: https://github.com/psf/black
rev: 23.1.0
hooks:
- id: black
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
hooks:
- id: flake8
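
With this configuration in place, contributors would typically run `pre-commit install` once after cloning (and `pre-commit run --all-files` to check the whole tree); the hooks then fix whitespace, validate YAML and JSON, flag leftover debug statements, and apply isort, black, and flake8 on every commit. These are the standard pre-commit commands rather than anything defined by this repository.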
2 changes: 1 addition & 1 deletion README.md
@@ -3,7 +3,7 @@ Storage and infrastructure for affiliation data (used for augment pipeline, and

## Installation

Install via pip:

```
pip install git+https://github.com/seasidesparrow/ADSAffilDB
```
6 changes: 6 additions & 0 deletions adsaffildb/app.py
@@ -0,0 +1,6 @@
from adsputils import ADSCelery


class ADSAffilDBCelery(ADSCelery):
def __init__(self, app_name, *args, **kwargs):
ADSCelery.__init__(self, app_name, *args, **kwargs)
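
`ADSCelery` comes from the `adsputils` package; judging by how `tasks.py` below uses this subclass, it supplies the Celery application object along with a `logger` attribute, an `exchange`, and a `session_scope()` context manager for database sessions.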
57 changes: 57 additions & 0 deletions adsaffildb/models.py
@@ -0,0 +1,57 @@
try:
from adsputils import UTCDateTime, get_date
except ImportError:
from adsmutils import get_date, UTCDateTime

from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class AffilData(Base):
"""
affil_data holds the mapping of published string and affiliation ID
"""

__tablename__ = "affil_data"

data_key = Column(Integer, primary_key=True, unique=True)
data_id = Column(String(6), nullable=False)
data_pubstring = Column(Text, unique=True, nullable=False)
created = Column(UTCDateTime, default=get_date)
updated = Column(UTCDateTime, onupdate=get_date)


class AffilInst(Base):
__tablename__ = "affil_inst"

inst_key = Column(Integer, primary_key=True, unique=True)
inst_id = Column(String(6), unique=True, nullable=False)
inst_parents = Column(String, nullable=True)
inst_canonical = Column(String, nullable=False)
inst_abbreviation = Column(String, nullable=False)
inst_country = Column(String, nullable=True)
# in place of location, we could consider using GeoAlchemy2 here
# especially if we can get lat-lon from ROR
inst_location = Column(String, nullable=True)
inst_rorid = Column(String, nullable=True)
inst_notes = Column(Text, nullable=True)
created = Column(UTCDateTime, default=get_date)


class AffilNorm(Base):
__tablename__ = "affil_norm"

norm_key = Column(Integer, primary_key=True, unique=True)
norm_id = Column(String(6), unique=False, nullable=False)
norm_string = Column(Text, unique=True, nullable=False)


class AffilCuration(Base):
__tablename__ = "affil_curation"

curation_key = Column(Integer, primary_key=True, unique=True)
curation_count = Column(Integer, nullable=True)
curation_id = Column(String(6), unique=False, nullable=True)
curation_string = Column(Text, unique=True, nullable=False)
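
As a quick orientation to the schema, here is a minimal sketch (not part of the commit) that creates the four tables against a throwaway SQLite engine and inserts one `affil_data` row; the connection URL and sample values are hypothetical, and `adsputils` must be importable for the `UTCDateTime` columns to work.

```python
# Hypothetical sketch: exercise the models against a local SQLite database.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from adsaffildb.models import AffilData, Base

engine = create_engine("sqlite:///affildb_demo.sqlite3")
Base.metadata.create_all(engine)  # creates affil_data, affil_inst, affil_norm, affil_curation

Session = sessionmaker(bind=engine)
with Session() as session:
    # data_id is a 6-character affiliation identifier; this value is made up
    session.add(AffilData(data_id="A00001", data_pubstring="Harvard University, Cambridge, MA"))
    session.commit()
```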
74 changes: 74 additions & 0 deletions adsaffildb/normalize.py
@@ -0,0 +1,74 @@
import html
import re


class FixSemicolonsException(Exception):
pass


class CleanStringException(Exception):
pass


class NormalizeStringException(Exception):
pass


class BatchNormalizeException(Exception):
pass


regex_norm_semicolon = re.compile(r";\s*;")
regex_norm_punct = re.compile(r"[-!?.,;:/\\]")


# BEGIN utils also used by ADSAffilPipeline
def fix_semicolons(string):
try:
string_x = regex_norm_semicolon.sub(";", string).strip()
if string_x != string:
return fix_semicolons(string_x)
else:
return string_x
except Exception as err:
raise FixSemicolonsException("Error in fix_semicolons: %s" % err)


def clean_string(string):
try:
string = html.unescape(string)
string = fix_semicolons(string)
string = string.strip(";").strip()
return string
except Exception as err:
raise CleanStringException("Error in clean_string: %s" % err)


def normalize_string(string):
    # normalizing consists of
    # 1) replacing punctuation with spaces and collapsing runs of whitespace
    # 2) converting the string to upper-case
try:
string = regex_norm_punct.sub(" ", string)
string = " ".join(string.split())
string = string.upper()
return string
except Exception as err:
raise NormalizeStringException("Error in normalize_string: %s" % err)


def normalize_batch(data):
try:
output = []
seen = {}
for rec in data:
newstring = rec[1]
if newstring:
newstring = normalize_string(clean_string(newstring))
if not seen.get(newstring, None):
outrec = {"norm_id": rec[0], "norm_string": newstring}
seen[newstring] = rec[0]
output.append(outrec)
return output
except Exception as err:
raise BatchNormalizeException("Failed to normalize batch: %s" % err)
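
To make the normalization pipeline concrete, here is a small sketch (assuming the package is installed; the affiliation strings and IDs are made up) showing how the helpers compose and how `normalize_batch` deduplicates on the normalized form:

```python
from adsaffildb.normalize import clean_string, normalize_batch, normalize_string

raw = "Dept. of Physics;; Harvard Univ., Cambridge, MA;"
cleaned = clean_string(raw)       # unescape HTML, collapse ";;", strip stray ";"
print(normalize_string(cleaned))  # -> "DEPT OF PHYSICS HARVARD UNIV CAMBRIDGE MA"

# Records are (id, pubstring) tuples; the first id seen wins for each normalized string.
rows = [
    ("A00001", "Dept. of Physics; Harvard Univ."),
    ("A00002", "Dept of Physics Harvard Univ"),  # same after normalization -> skipped
]
print(normalize_batch(rows))
# -> [{"norm_id": "A00001", "norm_string": "DEPT OF PHYSICS HARVARD UNIV"}]
```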
65 changes: 65 additions & 0 deletions adsaffildb/tasks.py
@@ -0,0 +1,65 @@
import json
import math
import os

from kombu import Queue
from sqlalchemy import func

from adsaffildb import app as app_module
from adsaffildb import normalize, utils
from adsaffildb.models import AffilData as affil_data
from adsaffildb.models import AffilNorm as affil_norm

proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../"))
app = app_module.ADSAffilDBCelery(
"affildb-pipeline",
proj_home=proj_home,
config=globals().get("config", {}),
local_config=globals().get("local_config", {}),
)
logger = app.logger

app.conf.CELERY_QUEUES = (Queue("normalize", app.exchange, routing_key="normalize"),)


def task_bulk_insert_data(table, data):
with app.session_scope() as session:
try:
session.bulk_insert_mappings(table, data)
session.commit()
except Exception as err:
session.rollback()
session.flush()
logger.warning("Failed to bulk insert data: %s" % err)


def task_bulk_update_data(table, data):
with app.session_scope() as session:
try:
session.bulk_update_mappings(table, data)
session.commit()
except Exception as err:
session.rollback()
session.flush()
logger.warning("Failed to bulk update data: %s" % err)


def task_normalize_affils():
with app.session_scope() as session:
try:
results = session.query(affil_data.data_id, affil_data.data_pubstring).all()
logger.warning("Results is of type %s" % type(results))
norm_results = normalize.normalize_batch(results)
except Exception as err:
logger.warning("Failed to normalize data: %s" % err)
else:
try:
session.query(affil_norm).delete()
session.commit()
logger.info("AffilNorm table cleared.")
except Exception as err:
session.rollback()
session.flush()
logger.error("Failed to clear AffilNorm table: %s" % err)
else:
task_bulk_insert_data(affil_norm, norm_results)
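
A hypothetical way to drive this by hand (the module-level Celery app is created on import, so this assumes a working configuration): rebuild `affil_norm` from `affil_data` in one pass, or load pre-normalized mappings directly.

```python
from adsaffildb import tasks
from adsaffildb.models import AffilNorm

# Rebuild affil_norm from affil_data: query, normalize, clear, bulk-insert.
tasks.task_normalize_affils()

# Or bulk-insert rows directly (the mapping below is made up for illustration).
tasks.task_bulk_insert_data(AffilNorm, [{"norm_id": "A00001", "norm_string": "HARVARD UNIV"}])
```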