Merge pull request #68 from dhellmann/contributor-filter

add contributor filter
sphinx-contrib · Sep 5, 2020 · 579e9bc · 579e9bc
2 parents 4077a4c + 82fbe4a
commit 579e9bc
Show file tree

Hide file tree

Showing 7 changed files with 89 additions and 29 deletions.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -32,11 +32,11 @@
 
 spelling_word_list_filename = [
     'spelling_wordlist.txt',
-    'spelling_people.txt',
 ]
 
 spelling_show_suggestions = True
 spelling_ignore_pypi_package_names = True
+spelling_ignore_contributor_names = True
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']

diff --git a/docs/source/spelling_people.txt b/docs/source/spelling_people.txt
diff --git a/releasenotes/notes/contributor-name-filter-60f6f5014b292977.yaml b/releasenotes/notes/contributor-name-filter-60f6f5014b292977.yaml
@@ -0,0 +1,8 @@
+features:
+  - |
+    Added a new filter
+    (``sphinxcontrib.spelling.filters.ContributorFilter``) that treats
+    contributor names extracted from the git history as spelled
+    correctly, making it easier to refer to the names in
+    acknowledgments. Includes a new configuration option,
+    ``spelling_ignore_contributor_names``, to enable it.
diff --git a/sphinxcontrib/spelling/__init__.py b/sphinxcontrib/spelling/__init__.py
@@ -53,6 +53,8 @@ def setup(app):
     # Assume words that look like the names of importable modules are
     # spelled properly
     app.add_config_value('spelling_ignore_importable_modules', True, 'env')
+    # Treat contributor names from git history as spelled correctly
+    app.add_config_value('spelling_ignore_contributor_names', True, 'env')
     # Add any user-defined filter classes
     app.add_config_value('spelling_filters', [], 'env')
     # Set a user-provided list of files to ignore

diff --git a/sphinxcontrib/spelling/builder.py b/sphinxcontrib/spelling/builder.py
@@ -73,6 +73,9 @@ def init(self):
         if self.config.spelling_ignore_importable_modules:
             logger.info('Ignoring importable module names')
             f.append(filters.ImportableModuleFilter)
+        if self.config.spelling_ignore_contributor_names:
+            logger.info('Ignoring contributor names')
+            f.append(filters.ContributorFilter)
         f.extend(self._load_filter_classes(self.config.spelling_filters))
 
         if not os.path.isdir(self.outdir):

diff --git a/sphinxcontrib/spelling/filters.py b/sphinxcontrib/spelling/filters.py
@@ -5,13 +5,19 @@
 """Spelling checker extension for Sphinx.
 """
 
+# TODO - Words with multiple uppercase letters treated as classes and ignored
+
 import builtins
 import imp
+import subprocess
 import xmlrpc.client as xmlrpc_client
 
-from enchant.tokenize import Filter, tokenize, unit_tokenize
+from enchant.tokenize import Filter, get_tokenizer, tokenize, unit_tokenize
+
+from sphinx.util import logging
 
-# TODO - Words with multiple uppercase letters treated as classes and ignored
+
+logger = logging.getLogger(__name__)
 
 
 class AcronymFilter(Filter):
@@ -193,3 +199,35 @@ def _skip(self, word):
                 self.found_modules.add(word)
                 return True
         return word in self.found_modules
+
+
+class ContributorFilter(IgnoreWordsFilter):
+    """Accept information about contributors as spelled correctly.
+
+    Look in the git history for authors and committers and accept
+    tokens that are in the set.
+    """
+
+    # Format handed to ``git log --pretty=format:``. It emits the
+    # Co-Authored-By trailers, the author name (%an) and the committer
+    # name (%cn), each separated by a newline (%x0A).
+    _pretty_format = (
+        '%(trailers:key=Co-Authored-By,separator=%x0A)%x0A%an%x0A%cn'
+    )
+
+    def __init__(self, tokenizer):
+        contributors = self._get_contributors()
+        IgnoreWordsFilter.__init__(self, tokenizer, contributors)
+
+    def _get_contributors(self):
+        """Return the set of word tokens found in contributor names.
+
+        Runs ``git log`` in the current working directory and tokenizes
+        its output. If git cannot be started or exits with an error, a
+        warning is logged and an empty set is returned so that the
+        filter simply ignores nothing.
+        """
+        logger.info('Scanning contributors')
+        cmd = [
+            'git', 'log', '--quiet', '--no-color',
+            '--pretty=format:' + self._pretty_format,
+        ]
+        try:
+            p = subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
+        except (subprocess.CalledProcessError, OSError) as err:
+            # CalledProcessError: git ran but failed (for example when
+            # not inside a repository). OSError: git could not be
+            # executed at all (for example when it is not installed).
+            logger.warning('Called: {}'.format(' '.join(cmd)))
+            logger.warning('Failed to scan contributors: {}'.format(err))
+            return set()
+        # Replace undecodable bytes instead of raising, so one oddly
+        # encoded name in the history cannot abort the whole scan.
+        output = p.stdout.decode('utf-8', errors='replace')
+        tokenizer = get_tokenizer('en_US', filters=[])
+        return {word for word, pos in tokenizer(output)}
diff --git a/sphinxcontrib/spelling/tests/test_filter.py b/sphinxcontrib/spelling/tests/test_filter.py
@@ -34,3 +34,38 @@ def test_acronym_unicode():
     f = filters.AcronymFilter(t)
     words = [w[0] for w in f(text)]
     assert u'DBM' not in words, 'Failed to filter out acronym'
+
+
+def test_contributors():
+    # The filter is built from the git history of the current working
+    # directory, so the names below are expected to appear in the log
+    # of the checkout the tests run from.
+    f = filters.ContributorFilter(None)
+    names = [
+        "Alex",
+        "Atlakson",
+        "Avram",
+        "Baumgold",
+        "Berman",
+        "Daniele",
+        "Doug",
+        "Finucane",
+        "Gaynor",
+        "Gonsiorowski",
+        "Hong",
+        "Huon",
+        "Kampik",
+        "Kolosov",
+        "Lubkin",
+        "Marti",
+        "Minhee",
+        "Olausson",
+        "Raggam",
+        "Raudsepp",
+        "sdelliot",
+        "Sergey",
+        "Sevilla",
+        "Timotheus",
+        "Tobias",
+        "Tricoli",
+    ]
+    for name in names:
+        # Name the offending entry so a failure is easy to diagnose.
+        assert f._skip(name), 'Failed to skip contributor %r' % name