-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from seasidesparrow/main
Dev setup and testing
- Loading branch information
Showing
22 changed files
with
993 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
--- | ||
name: Bug report | ||
about: Create a report to help us improve | ||
labels: bug | ||
--- | ||
|
||
**Describe the bug** | ||
A clear and concise description of what the bug is. | ||
|
||
**To Reproduce** | ||
Steps to reproduce the behavior: | ||
|
||
**Additional context** | ||
Add any other context about the problem here. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
--- | ||
name: Feature request | ||
about: Suggest an idea for this project | ||
labels: enhancement | ||
--- | ||
|
||
**Is your feature request related to a problem? Please describe.** | ||
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] | ||
|
||
**Describe the solution you'd like** | ||
A clear and concise description of what you want to happen. | ||
|
||
**Additional context** | ||
Add any other context or screenshots about the feature request here. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
[breaking] | ||
color = "ffcc00" | ||
name = "breaking" | ||
description = "Breaking change." | ||
|
||
[bug] | ||
color = "d73a4a" | ||
name = "bug" | ||
description = "Something isn't working" | ||
|
||
[dependencies] | ||
color = "0366d6" | ||
name = "dependencies" | ||
description = "Pull requests that update a dependency file" | ||
|
||
[github_actions] | ||
color = "000000" | ||
name = "github_actions" | ||
description = "Update of github actions" | ||
|
||
[documentation] | ||
color = "1bc4a5" | ||
name = "documentation" | ||
description = "Improvements or additions to documentation" | ||
|
||
[duplicate] | ||
color = "cfd3d7" | ||
name = "duplicate" | ||
description = "This issue or pull request already exists" | ||
|
||
[enhancement] | ||
color = "a2eeef" | ||
name = "enhancement" | ||
description = "New feature or request" | ||
|
||
["good first issue"] | ||
color = "7057ff" | ||
name = "good first issue" | ||
description = "Good for newcomers" | ||
|
||
["help wanted"] | ||
color = "008672" | ||
name = "help wanted" | ||
description = "Extra attention is needed" | ||
|
||
[invalid] | ||
color = "e4e669" | ||
name = "invalid" | ||
description = "This doesn't seem right" | ||
|
||
[nochangelog] | ||
color = "555555" | ||
name = "nochangelog" | ||
description = "Exclude pull requests from changelog" | ||
|
||
[question] | ||
color = "d876e3" | ||
name = "question" | ||
description = "Further information is requested" | ||
|
||
[removed] | ||
color = "e99695" | ||
name = "removed" | ||
description = "Removed functionality." | ||
|
||
[tests] | ||
color = "bfd4f2" | ||
name = "tests" | ||
description = "CI, CD and testing related changes" | ||
|
||
[wontfix] | ||
color = "ffffff" | ||
name = "wontfix" | ||
description = "This will not be worked on" | ||
|
||
[discussion] | ||
color = "c2e0c6" | ||
name = "discussion" | ||
description = "Some discussion around the project" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# See https://pre-commit.com for more information | ||
# See https://pre-commit.com/hooks.html for more hooks | ||
exclude: "CHANGELOG.md" | ||
|
||
ci: | ||
autofix_commit_msg: "chore(pre-commit.ci): auto fixes" | ||
autoupdate_commit_msg: "chore(pre-commit.ci): pre-commit autoupdate" | ||
|
||
repos: | ||
- repo: https://github.com/pre-commit/pre-commit-hooks | ||
rev: v4.4.0 | ||
hooks: | ||
- id: trailing-whitespace | ||
- id: end-of-file-fixer | ||
- id: check-yaml | ||
- id: check-json | ||
- id: debug-statements | ||
- repo: https://github.com/PyCQA/isort | ||
rev: 5.12.0 | ||
hooks: | ||
- id: isort | ||
- repo: https://github.com/psf/black | ||
rev: 23.1.0 | ||
hooks: | ||
- id: black | ||
- repo: https://github.com/PyCQA/flake8 | ||
rev: 6.0.0 | ||
hooks: | ||
- id: flake8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from adsputils import ADSCelery | ||
|
||
|
||
class ADSAffilDBCelery(ADSCelery):
    """Celery application class for this package.

    A thin subclass of ``ADSCelery``: construction is forwarded unchanged to
    the parent initializer and no extra state or behaviour is added here.
    """

    def __init__(self, app_name, *args, **kwargs):
        """Initialize the parent ``ADSCelery`` with the given arguments.

        Args:
            app_name: application name handed to ``ADSCelery``.
            *args: positional arguments passed through untouched.
            **kwargs: keyword arguments passed through untouched.
        """
        super().__init__(app_name, *args, **kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
try: | ||
from adsputils import UTCDateTime, get_date | ||
except ImportError: | ||
from adsmutils import get_date, UTCDateTime | ||
|
||
from sqlalchemy import Column, Integer, String, Text | ||
from sqlalchemy.ext.declarative import declarative_base | ||
|
||
Base = declarative_base() | ||
|
||
|
||
class AffilData(Base):
    """Row of ``affil_data``: a published string mapped to an affiliation ID."""

    __tablename__ = "affil_data"

    # Integer primary key.
    data_key = Column(Integer, primary_key=True, unique=True)

    # Affiliation identifier, at most 6 characters; required, not unique.
    data_id = Column(String(6), nullable=False)

    # Published affiliation string; required, and unique across the table.
    data_pubstring = Column(Text, unique=True, nullable=False)

    # Filled from get_date when the row is inserted.
    created = Column(UTCDateTime, default=get_date)

    # Filled from get_date whenever the row is updated; no insert-time default.
    updated = Column(UTCDateTime, onupdate=get_date)
|
||
|
||
class AffilInst(Base):
    """Row of ``affil_inst``.

    NOTE(review): the column names suggest one record per institution
    (canonical name, abbreviation, country, ROR id, ...); confirm against the
    code that loads this table.
    """

    __tablename__ = "affil_inst"

    # Integer primary key.
    inst_key = Column(Integer, primary_key=True, unique=True)

    # Identifier of at most 6 characters; required and unique.
    inst_id = Column(String(6), unique=True, nullable=False)

    # Optional string column (presumably parent identifiers; verify format).
    inst_parents = Column(String, nullable=True)

    # Required string columns.
    inst_canonical = Column(String, nullable=False)
    inst_abbreviation = Column(String, nullable=False)

    # Optional string column.
    inst_country = Column(String, nullable=True)

    # in place of location, we could consider using GeoAlchemy2 here
    # especially if we can get lat-lon from ROR
    inst_location = Column(String, nullable=True)

    # Optional string / text columns.
    inst_rorid = Column(String, nullable=True)
    inst_notes = Column(Text, nullable=True)

    # Filled from get_date when the row is inserted.
    created = Column(UTCDateTime, default=get_date)
|
||
|
||
class AffilNorm(Base):
    """Row of ``affil_norm``: an identifier paired with a unique string.

    NOTE(review): the ``norm_`` prefix suggests the string is the normalized
    form of an affiliation string; confirm against the code that writes it.
    """

    __tablename__ = "affil_norm"

    # Integer primary key.
    norm_key = Column(Integer, primary_key=True, unique=True)

    # Identifier of at most 6 characters; required, may repeat across rows.
    norm_id = Column(String(6), unique=False, nullable=False)

    # Required text; unique across the table.
    norm_string = Column(Text, unique=True, nullable=False)
|
||
|
||
class AffilCuration(Base):
    """Row of ``affil_curation``.

    NOTE(review): presumably strings queued for manual curation together with
    an occurrence count and an optional assigned ID; confirm with the code
    that fills this table.
    """

    __tablename__ = "affil_curation"

    # Integer primary key.
    curation_key = Column(Integer, primary_key=True, unique=True)

    # Optional integer count.
    curation_count = Column(Integer, nullable=True)

    # Optional identifier of at most 6 characters; may repeat across rows.
    curation_id = Column(String(6), unique=False, nullable=True)

    # Required text; unique across the table.
    curation_string = Column(Text, unique=True, nullable=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import html | ||
import re | ||
|
||
|
||
class FixSemicolonsException(Exception):
    """Raised by ``fix_semicolons`` when collapsing semicolons fails."""
|
||
|
||
class CleanStringException(Exception):
    """Raised by ``clean_string`` when a string cannot be cleaned."""
|
||
|
||
class NormalizeStringException(Exception):
    """Raised by ``normalize_string`` when a string cannot be normalized."""
|
||
|
||
class BatchNormalizeException(Exception):
    """Raised by ``normalize_batch`` when a batch cannot be processed."""
|
||
|
||
# Two semicolons separated only by optional whitespace; fix_semicolons
# replaces each match with a single ";".
regex_norm_semicolon = re.compile(r";\s*;")
# A single punctuation character from the set - ! ? . , ; : / \ ;
# normalize_string replaces each match with a space.
regex_norm_punct = re.compile(r"[-!?.,;:/\\]")
|
||
|
||
# BEGIN utils also used by ADSAffilPipeline | ||
def fix_semicolons(string):
    """Collapse runs of semicolons (optionally separated by whitespace).

    The substitution and an outer ``strip()`` are applied repeatedly until
    the text stops changing.

    Args:
        string: the text to process.

    Returns:
        The text with every ``;<whitespace>;`` sequence reduced to a single
        ``;`` and with leading/trailing whitespace removed.

    Raises:
        FixSemicolonsException: if the substitution fails for any reason.
    """
    try:
        current = string
        while True:
            collapsed = regex_norm_semicolon.sub(";", current).strip()
            # A pass that changes nothing means we have reached the fixed point.
            if collapsed == current:
                return collapsed
            current = collapsed
    except Exception as err:
        raise FixSemicolonsException("Error in fix_semicolons: %s" % err)
|
||
|
||
def clean_string(string):
    """Unescape HTML entities and tidy the semicolons in ``string``.

    Args:
        string: the raw text to clean.

    Returns:
        The text after ``html.unescape``, after ``fix_semicolons``, and with
        leading/trailing semicolons and then whitespace stripped.

    Raises:
        CleanStringException: if any of the steps fails.
    """
    try:
        unescaped = html.unescape(string)
        collapsed = fix_semicolons(unescaped)
        # Drop semicolons at either end first, then whatever whitespace that
        # exposes.
        return collapsed.strip(";").strip()
    except Exception as err:
        raise CleanStringException("Error in clean_string: %s" % err)
|
||
|
||
def normalize_string(string):
    """Return the normalized, upper-case form of ``string``.

    Normalizing consists of:
      1) replacing each punctuation character matched by
         ``regex_norm_punct`` with a space,
      2) collapsing every run of whitespace to a single space and trimming
         the ends,
      3) upper-casing the result with ``str.upper``.

    Args:
        string: the text to normalize.

    Returns:
        The normalized text.

    Raises:
        NormalizeStringException: if any of the steps fails.
    """
    try:
        spaced = regex_norm_punct.sub(" ", string)
        tokens = spaced.split()
        return " ".join(tokens).upper()
    except Exception as err:
        raise NormalizeStringException("Error in normalize_string: %s" % err)
|
||
|
||
def normalize_batch(data):
    """Clean, normalize and de-duplicate a batch of affiliation strings.

    Args:
        data: iterable of records indexable as ``rec[0]`` (the ID to keep)
            and ``rec[1]`` (the raw string to normalize).

    Returns:
        A list of ``{"norm_id": ..., "norm_string": ...}`` dicts, one per
        distinct normalized string, in first-seen order. The ID kept for a
        string is the one from the first record that produced it. Records
        whose raw string is empty or ``None`` are skipped.

    Raises:
        BatchNormalizeException: if any record cannot be processed; the
            underlying error is attached as ``__cause__``.
    """
    try:
        output = []
        # Track normalized strings by membership, not by the truthiness of a
        # stored ID: a falsy ID ("" / 0 / None) must still mark its string as
        # seen, otherwise duplicates slip through.
        seen = set()
        for rec in data:
            raw = rec[1]
            if not raw:
                continue
            newstring = normalize_string(clean_string(raw))
            if newstring in seen:
                continue
            seen.add(newstring)
            output.append({"norm_id": rec[0], "norm_string": newstring})
        return output
    except Exception as err:
        raise BatchNormalizeException("Failed to normalize batch: %s" % err) from err
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import json | ||
import math | ||
import os | ||
|
||
from kombu import Queue | ||
from sqlalchemy import func | ||
|
||
from adsaffildb import app as app_module | ||
from adsaffildb import normalize, utils | ||
from adsaffildb.models import AffilData as affil_data | ||
from adsaffildb.models import AffilNorm as affil_norm | ||
|
||
# Parent of the directory that contains this module, with symlinks resolved.
proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../"))
# Module-wide Celery application. ``config`` / ``local_config`` are read from
# this module's globals when such names exist; otherwise empty dicts are used.
app = app_module.ADSAffilDBCelery(
    "affildb-pipeline",
    proj_home=proj_home,
    config=globals().get("config", {}),
    local_config=globals().get("local_config", {}),
)
logger = app.logger

# Declare a single queue named "normalize" on the app's exchange.
app.conf.CELERY_QUEUES = (Queue("normalize", app.exchange, routing_key="normalize"),)
|
||
|
||
def task_bulk_insert_data(table, data):
    """Bulk-insert ``data`` into ``table`` using a dedicated session.

    Args:
        table: mapped class handed to ``session.bulk_insert_mappings``.
        data: the mappings to insert, passed through unchanged.

    Errors from the insert or the commit are not propagated: the session is
    rolled back and the error is logged as a warning.
    """
    with app.session_scope() as db:
        try:
            db.bulk_insert_mappings(table, data)
            db.commit()
        except Exception as exc:
            # Undo the partial work before reporting the failure.
            db.rollback()
            db.flush()
            logger.warning("Failed to bulk insert data: %s" % exc)
|
||
|
||
def task_bulk_update_data(table, data):
    """Bulk-update rows of ``table`` from ``data`` using a dedicated session.

    Args:
        table: mapped class handed to ``session.bulk_update_mappings``.
        data: the mappings to apply, passed through unchanged.

    Errors from the update or the commit are not propagated: the session is
    rolled back and the error is logged as a warning.
    """
    with app.session_scope() as db:
        try:
            db.bulk_update_mappings(table, data)
            db.commit()
        except Exception as exc:
            # Undo the partial work before reporting the failure.
            db.rollback()
            db.flush()
            logger.warning("Failed to bulk update data: %s" % exc)
|
||
|
||
def task_normalize_affils():
    """Rebuild the AffilNorm table from the strings stored in AffilData.

    Reads every ``(data_id, data_pubstring)`` pair, normalizes the batch with
    ``normalize.normalize_batch`` and replaces the contents of AffilNorm with
    the result.

    The delete and the insert run in the same transaction, so a failed insert
    rolls back the delete and the previous contents of AffilNorm survive.
    Failures are logged and not propagated.
    """
    with app.session_scope() as session:
        try:
            results = session.query(affil_data.data_id, affil_data.data_pubstring).all()
            # Diagnostic only; kept at debug level so it does not pollute
            # warning-level logs.
            logger.debug("Results is of type %s", type(results))
            norm_results = normalize.normalize_batch(results)
        except Exception as err:
            # Leave the session clean in case the query itself failed.
            session.rollback()
            logger.warning("Failed to normalize data: %s", err)
            return

        try:
            # Clear and refill atomically: committing the delete on its own
            # would leave the table empty if the insert then failed.
            session.query(affil_norm).delete()
            session.bulk_insert_mappings(affil_norm, norm_results)
            session.commit()
        except Exception as err:
            session.rollback()
            logger.error("Failed to rebuild AffilNorm table: %s", err)
        else:
            logger.info("AffilNorm table rebuilt with %s rows.", len(norm_results))
Oops, something went wrong.