Merge pull request #2 from seasidesparrow/main
Dev setup and testing
seasidesparrow committed Feb 15, 2024
2 parents 25240a2 + 46651f0 commit 09d24e5
Showing 22 changed files with 993 additions and 14 deletions.
14 changes: 14 additions & 0 deletions .github/ISSUE_TEMPLATE/1-bug_report.md
@@ -0,0 +1,14 @@
---
name: Bug report
about: Create a report to help us improve
labels: bug
---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:

**Additional context**
Add any other context about the problem here.
14 changes: 14 additions & 0 deletions .github/ISSUE_TEMPLATE/2-feature-request.md
@@ -0,0 +1,14 @@
---
name: Feature request
about: Suggest an idea for this project
labels: enhancement
---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Additional context**
Add any other context or screenshots about the feature request here.
79 changes: 79 additions & 0 deletions .github/labels.toml
@@ -0,0 +1,79 @@
[breaking]
color = "ffcc00"
name = "breaking"
description = "Breaking change."

[bug]
color = "d73a4a"
name = "bug"
description = "Something isn't working"

[dependencies]
color = "0366d6"
name = "dependencies"
description = "Pull requests that update a dependency file"

[github_actions]
color = "000000"
name = "github_actions"
description = "Update of github actions"

[documentation]
color = "1bc4a5"
name = "documentation"
description = "Improvements or additions to documentation"

[duplicate]
color = "cfd3d7"
name = "duplicate"
description = "This issue or pull request already exists"

[enhancement]
color = "a2eeef"
name = "enhancement"
description = "New feature or request"

["good first issue"]
color = "7057ff"
name = "good first issue"
description = "Good for newcomers"

["help wanted"]
color = "008672"
name = "help wanted"
description = "Extra attention is needed"

[invalid]
color = "e4e669"
name = "invalid"
description = "This doesn't seem right"

[nochangelog]
color = "555555"
name = "nochangelog"
description = "Exclude pull requests from changelog"

[question]
color = "d876e3"
name = "question"
description = "Further information is requested"

[removed]
color = "e99695"
name = "removed"
description = "Removed piece of functionalities."

[tests]
color = "bfd4f2"
name = "tests"
description = "CI, CD and testing related changes"

[wontfix]
color = "ffffff"
name = "wontfix"
description = "This will not be worked on"

[discussion]
color = "c2e0c6"
name = "discussion"
description = "Some discussion around the project"
29 changes: 29 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,29 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: "CHANGELOG.md"

ci:
autofix_commit_msg: "chore(pre-commit.ci): auto fixes"
autoupdate_commit_msg: "chore(pre-commit.ci): pre-commit autoupdate"

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-json
- id: debug-statements
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
hooks:
- id: isort
- repo: https://github.com/psf/black
rev: 23.1.0
hooks:
- id: black
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
hooks:
- id: flake8
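
With this configuration in place, contributors would typically run `pre-commit install` once after cloning (and `pre-commit run --all-files` to check the whole tree); the hooks then fix whitespace, validate YAML and JSON, flag leftover debug statements, and apply isort, black, and flake8 on every commit. These are the standard pre-commit commands rather than anything defined by this repository.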
2 changes: 1 addition & 1 deletion README.md
@@ -3,7 +3,7 @@ Storage and infrastructure for affiliation data (used for augment pipeline, and

## Installation

Install via pip:

```
pip install git+https://github.com/seasidesparrow/ADSAffilDB
```
6 changes: 6 additions & 0 deletions adsaffildb/app.py
@@ -0,0 +1,6 @@
from adsputils import ADSCelery


class ADSAffilDBCelery(ADSCelery):
def __init__(self, app_name, *args, **kwargs):
ADSCelery.__init__(self, app_name, *args, **kwargs)
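
`ADSCelery` comes from the `adsputils` package; judging by how `tasks.py` below uses this subclass, it supplies the Celery application object along with a `logger` attribute, an `exchange`, and a `session_scope()` context manager for database sessions.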
57 changes: 57 additions & 0 deletions adsaffildb/models.py
@@ -0,0 +1,57 @@
try:
from adsputils import UTCDateTime, get_date
except ImportError:
from adsmutils import get_date, UTCDateTime

from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class AffilData(Base):
"""
affil_data holds the mapping of published string and affiliation ID
"""

__tablename__ = "affil_data"

data_key = Column(Integer, primary_key=True, unique=True)
data_id = Column(String(6), nullable=False)
data_pubstring = Column(Text, unique=True, nullable=False)
created = Column(UTCDateTime, default=get_date)
updated = Column(UTCDateTime, onupdate=get_date)


class AffilInst(Base):
__tablename__ = "affil_inst"

inst_key = Column(Integer, primary_key=True, unique=True)
inst_id = Column(String(6), unique=True, nullable=False)
inst_parents = Column(String, nullable=True)
inst_canonical = Column(String, nullable=False)
inst_abbreviation = Column(String, nullable=False)
inst_country = Column(String, nullable=True)
# in place of location, we could consider using GeoAlchemy2 here
# especially if we can get lat-lon from ROR
inst_location = Column(String, nullable=True)
inst_rorid = Column(String, nullable=True)
inst_notes = Column(Text, nullable=True)
created = Column(UTCDateTime, default=get_date)


class AffilNorm(Base):
__tablename__ = "affil_norm"

norm_key = Column(Integer, primary_key=True, unique=True)
norm_id = Column(String(6), unique=False, nullable=False)
norm_string = Column(Text, unique=True, nullable=False)


class AffilCuration(Base):
__tablename__ = "affil_curation"

curation_key = Column(Integer, primary_key=True, unique=True)
curation_count = Column(Integer, nullable=True)
curation_id = Column(String(6), unique=False, nullable=True)
curation_string = Column(Text, unique=True, nullable=False)
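
As a quick orientation to the schema, here is a minimal sketch (not part of the commit) that creates the four tables against a throwaway SQLite engine and inserts one `affil_data` row; the connection URL and sample values are hypothetical, and `adsputils` must be importable for the `UTCDateTime` columns to work.

```python
# Hypothetical sketch: exercise the models against a local SQLite database.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from adsaffildb.models import AffilData, Base

engine = create_engine("sqlite:///affildb_demo.sqlite3")
Base.metadata.create_all(engine)  # creates affil_data, affil_inst, affil_norm, affil_curation

Session = sessionmaker(bind=engine)
with Session() as session:
    # data_id is a 6-character affiliation identifier; this value is made up
    session.add(AffilData(data_id="A00001", data_pubstring="Harvard University, Cambridge, MA"))
    session.commit()
```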
74 changes: 74 additions & 0 deletions adsaffildb/normalize.py
@@ -0,0 +1,74 @@
import html
import re


class FixSemicolonsException(Exception):
pass


class CleanStringException(Exception):
pass


class NormalizeStringException(Exception):
pass


class BatchNormalizeException(Exception):
pass


regex_norm_semicolon = re.compile(r";\s*;")
regex_norm_punct = re.compile(r"[-!?.,;:/\\]")


# BEGIN utils also used by ADSAffilPipeline
def fix_semicolons(string):
try:
string_x = regex_norm_semicolon.sub(";", string).strip()
if string_x != string:
return fix_semicolons(string_x)
else:
return string_x
except Exception as err:
raise FixSemicolonsException("Error in fix_semicolons: %s" % err)


def clean_string(string):
try:
string = html.unescape(string)
string = fix_semicolons(string)
string = string.strip(";").strip()
return string
except Exception as err:
raise CleanStringException("Error in clean_string: %s" % err)


def normalize_string(string):
    # normalizing consists of
    # 1) replacing punctuation with spaces and collapsing runs of whitespace
    # 2) converting the string to upper-case
try:
string = regex_norm_punct.sub(" ", string)
string = " ".join(string.split())
string = string.upper()
return string
except Exception as err:
raise NormalizeStringException("Error in normalize_string: %s" % err)


def normalize_batch(data):
try:
output = []
seen = {}
for rec in data:
newstring = rec[1]
if newstring:
newstring = normalize_string(clean_string(newstring))
if not seen.get(newstring, None):
outrec = {"norm_id": rec[0], "norm_string": newstring}
seen[newstring] = rec[0]
output.append(outrec)
return output
except Exception as err:
raise BatchNormalizeException("Failed to normalize batch: %s" % err)
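
To make the normalization pipeline concrete, here is a small sketch (assuming the package is installed; the affiliation strings and IDs are made up) showing how the helpers compose and how `normalize_batch` deduplicates on the normalized form:

```python
from adsaffildb.normalize import clean_string, normalize_batch, normalize_string

raw = "Dept. of Physics;; Harvard Univ., Cambridge, MA;"
cleaned = clean_string(raw)       # unescape HTML, collapse ";;", strip stray ";"
print(normalize_string(cleaned))  # -> "DEPT OF PHYSICS HARVARD UNIV CAMBRIDGE MA"

# Records are (id, pubstring) tuples; the first id seen wins for each normalized string.
rows = [
    ("A00001", "Dept. of Physics; Harvard Univ."),
    ("A00002", "Dept of Physics Harvard Univ"),  # same after normalization -> skipped
]
print(normalize_batch(rows))
# -> [{"norm_id": "A00001", "norm_string": "DEPT OF PHYSICS HARVARD UNIV"}]
```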
65 changes: 65 additions & 0 deletions adsaffildb/tasks.py
@@ -0,0 +1,65 @@
import json
import math
import os

from kombu import Queue
from sqlalchemy import func

from adsaffildb import app as app_module
from adsaffildb import normalize, utils
from adsaffildb.models import AffilData as affil_data
from adsaffildb.models import AffilNorm as affil_norm

proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../"))
app = app_module.ADSAffilDBCelery(
"affildb-pipeline",
proj_home=proj_home,
config=globals().get("config", {}),
local_config=globals().get("local_config", {}),
)
logger = app.logger

app.conf.CELERY_QUEUES = (Queue("normalize", app.exchange, routing_key="normalize"),)


def task_bulk_insert_data(table, data):
with app.session_scope() as session:
try:
session.bulk_insert_mappings(table, data)
session.commit()
except Exception as err:
session.rollback()
session.flush()
logger.warning("Failed to bulk insert data: %s" % err)


def task_bulk_update_data(table, data):
with app.session_scope() as session:
try:
session.bulk_update_mappings(table, data)
session.commit()
except Exception as err:
session.rollback()
session.flush()
logger.warning("Failed to bulk update data: %s" % err)


def task_normalize_affils():
with app.session_scope() as session:
try:
results = session.query(affil_data.data_id, affil_data.data_pubstring).all()
logger.warning("Results is of type %s" % type(results))
norm_results = normalize.normalize_batch(results)
except Exception as err:
logger.warning("Failed to normalize data: %s" % err)
else:
try:
session.query(affil_norm).delete()
session.commit()
logger.info("AffilNorm table cleared.")
except Exception as err:
session.rollback()
session.flush()
logger.error("Failed to clear AffilNorm table: %s" % err)
else:
task_bulk_insert_data(affil_norm, norm_results)
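
A hypothetical way to drive this by hand (the module-level Celery app is created on import, so this assumes a working configuration): rebuild `affil_norm` from `affil_data` in one pass, or load pre-normalized mappings directly.

```python
from adsaffildb import tasks
from adsaffildb.models import AffilNorm

# Rebuild affil_norm from affil_data: query, normalize, clear, bulk-insert.
tasks.task_normalize_affils()

# Or bulk-insert rows directly (the mapping below is made up for illustration).
tasks.task_bulk_insert_data(AffilNorm, [{"norm_id": "A00001", "norm_string": "HARVARD UNIV"}])
```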