
Version 0.2.0
Labbeti committed Dec 14, 2022
1 parent f6b50c9 commit dd8dff3
Showing 43 changed files with 2,852 additions and 284 deletions.
2 changes: 2 additions & 0 deletions .flake8
@@ -19,6 +19,8 @@ exclude =
.ipynb_checkpoints,
# Ignore submodule caption-evaluation-tools
tests/caption-evaluation-tools
# Ignore submodule fense
tests/fense
per-file-ignores =
# imported but unused
__init__.py: F401
2 changes: 1 addition & 1 deletion .github/workflows/python-package-pip.yaml
@@ -55,7 +55,7 @@ jobs:
uses: actions/cache@master
id: cache_external
with:
path: /home/runner/aac-metrics-cache/*
path: /home/runner/.cache/aac-metrics-/*
key: ${{ runner.os }}-${{ hashFiles('install_spice.sh') }}
restore-keys: |
${{ runner.os }}-
1 change: 1 addition & 0 deletions .gitignore
@@ -132,5 +132,6 @@ dmypy.json
.vscode/

tests/caption-evaluation-tools
tests/fense
tmp/
tmp*/
3 changes: 3 additions & 0 deletions .gitmodules
@@ -2,3 +2,6 @@
path = tests/caption-evaluation-tools
url = https://github.com/audio-captioning/caption-evaluation-tools
branch = master
[submodule "fense"]
path = tests/fense
url = https://github.com/blmoistawinde/fense
15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,21 @@

All notable changes to this project will be documented in this file.

## [0.2.0] 2022-12-14
### Added
- `FENSE` metric (class and functional versions), with fluency error rate and raw output probabilities.
- Unit tests against the `fense` repository.
- `load_metric` function in the package init to match the HuggingFace evaluation package interface.

### Changed
- Rename `global_scores` to `corpus_scores` and `local_scores` to `sents_scores` (see the migration sketch at the end of this entry).
- Rename `CustomEvaluate` to `Evaluate` and `custom_evaluate` to `evaluate`.
- Set default cache path to `$HOME/.cache`.
- Remove the 'coco' prefix from file, function and class names to get cleaner names.

### Fixed
- `FENSE` metric error when computing scores with fewer than `batch_size` sentences.
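
A hedged migration sketch for the renames listed under "Changed" (the `evaluate` import path and its call signature are assumptions, not taken from this diff):

```python
from aac_metrics.functional.evaluate import evaluate  # assumed new location of the renamed `custom_evaluate`

candidates = ["a man is speaking"]
mult_references = [["a man speaks.", "someone speaks."]]

# 0.1.x (old names):  global_scores, local_scores = custom_evaluate(candidates, mult_references)
# 0.2.0 (new names):
corpus_scores, sents_scores = evaluate(candidates, mult_references)
```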

## [0.1.2] 2022-10-31
### Added
- All candidates scores option `return_all_cands_scores` for SPIDEr-max.
81 changes: 46 additions & 35 deletions README.md
@@ -13,14 +13,15 @@ Audio Captioning metrics source code, designed for Pytorch.

</div>

This package is a tool to evaluate sentences produced by automatic models to caption image or audio.
The results of BLEU [[1]](#bleu), ROUGE-L [[2]](#rouge-l), METEOR [[3]](#meteor), CIDEr-D [[4]](#cider), SPICE [[5]](#spice) and SPIDEr [[6]](#spider) are consistent with [caption-evaluation-tools](https://github.com/audio-captioning/caption-evaluation-tools).
This package is a tool to evaluate sentences produced by automated captioning systems.
The results are the same as BLEU [[1]](#bleu), ROUGE-L [[2]](#rouge-l), METEOR [[3]](#meteor), CIDEr-D [[4]](#cider), SPICE [[5]](#spice) and SPIDEr [[6]](#spider) in [caption-evaluation-tools](https://github.com/audio-captioning/caption-evaluation-tools), and as FENSE [[8]](#fense) in [fense](https://github.com/blmoistawinde/fense).

## Why use this package?
- Easy installation with pip
- Consistent with [caption-evaluation-tools](https://github.com/audio-captioning/caption-evaluation-tools)
- Provides functions and classes to compute metrics separately
- Provides SPIDEr-max metric as described in the DCASE paper [[7]](#spider-max)
- Returns torch tensors directly
- Same results as [caption-evaluation-tools](https://github.com/audio-captioning/caption-evaluation-tools) and [fense](https://github.com/blmoistawinde/fense)
- Provides SPIDEr-max [[7]](#spider-max) and FENSE [[8]](#fense) metrics

## Installation
Install the pip package:
@@ -33,23 +34,24 @@ Download the external code needed for METEOR, SPICE and PTBTokenizer:
aac-metrics-download
```

Note: The external code for SPICE, METEOR and PTBTokenizer is stored in the cache directory (default: `$HOME/aac-metrics-cache/`)
Note: The external code for SPICE, METEOR and PTBTokenizer is stored in the cache directory (default: `$HOME/.cache/aac-metrics/`)

## Metrics
### Default AAC metrics
| Metric | Python Class | Origin | Range | Short description |
|:---|:---|:---|:---|:---|
| BLEU [[1]](#bleu) | `CocoBLEU` | machine translation | [0, 1] | Precision of n-grams |
| ROUGE-L [[2]](#rouge-l) | `CocoRougeL` | machine translation | [0, 1] | FScore of the longest common subsequence |
| METEOR [[3]](#meteor) | `CocoMETEOR` | machine translation | [0, 1] | Cosine-similarity of frequencies |
| CIDEr-D [[4]](#cider) | `CocoCIDErD` | image captioning | [0, 10] | Cosine-similarity of TF-IDF |
| SPICE [[5]](#spice) | `CocoSPICE` | image captioning | [0, 1] | FScore of semantic graph |
| BLEU [[1]](#bleu) | `BLEU` | machine translation | [0, 1] | Precision of n-grams |
| ROUGE-L [[2]](#rouge-l) | `ROUGEL` | machine translation | [0, 1] | FScore of the longest common subsequence |
| METEOR [[3]](#meteor) | `METEOR` | machine translation | [0, 1] | Cosine-similarity of frequencies |
| CIDEr-D [[4]](#cider) | `CIDErD` | image captioning | [0, 10] | Cosine-similarity of TF-IDF computed on n-grams |
| SPICE [[5]](#spice) | `SPICE` | image captioning | [0, 1] | FScore of semantic graph |
| SPIDEr [[6]](#spider) | `SPIDEr` | image captioning | [0, 5.5] | Mean of CIDEr-D and SPICE |

### Other metrics
| Metric name | Python Class | Origin | Range | Short description |
|:---|:---|:---|:---|:---|
| SPIDEr-max [[7]](#spider-max) | `SPIDErMax` | audio captioning | [0, 5.5] | Max of SPIDEr scores for multiple candidates |
| FENSE [[8]](#fense) | `FENSE` | audio captioning | [-1, 1] | Cosine-similarity of **Sentence-BERT embeddings** combined with fluency error detector |

## Usage
### Evaluate AAC metrics
@@ -61,8 +63,8 @@ from aac_metrics import aac_evaluate
candidates: list[str] = ["a man is speaking", ...]
mult_references: list[list[str]] = [["a man speaks.", "someone speaks.", "a man is speaking while a bird is chirping in the background"], ...]

global_scores, _ = aac_evaluate(candidates, mult_references)
print(global_scores)
corpus_scores, _ = aac_evaluate(candidates, mult_references)
print(corpus_scores)
# dict containing the score of each aac metric: "bleu_1", "bleu_2", "bleu_3", "bleu_4", "rouge_l", "meteor", "cider_d", "spice", "spider"
# {"bleu_1": tensor(0.7), "bleu_2": ..., ...}
```
@@ -71,7 +73,7 @@ print(global_scores)
Evaluating a specific metric can be done using the `aac_metrics.functional.<metric_name>.<metric_name>` function or the `aac_metrics.classes.<metric_name>.<metric_name>` class. Unlike `aac_evaluate`, the tokenization with PTBTokenizer is not done by these functions, but you can apply it manually with the `preprocess_mono_sents` and `preprocess_mult_sents` functions.

```python
from aac_metrics.functional import coco_cider_d
from aac_metrics.functional import cider_d
from aac_metrics.utils.tokenization import preprocess_mono_sents, preprocess_mult_sents

candidates: list[str] = ["a man is speaking", ...]
@@ -80,27 +82,29 @@ mult_references: list[list[str]] = [["a man speaks.", "someone speaks.", "a man
candidates = preprocess_mono_sents(candidates)
mult_references = preprocess_mult_sents(mult_references)

global_scores, local_scores = coco_cider_d(candidates, mult_references)
print(global_scores)
corpus_scores, sents_scores = cider_d(candidates, mult_references)
print(corpus_scores)
# {"cider_d": tensor(0.1)}
print(local_scores)
print(sents_scores)
# {"cider_d": tensor([0.9, ...])}
```

Each metrics also exists as a python class version, like `aac_metrics.classes.coco_cider_d.CocoCIDErD`.
Each metric also exists as a Python class version, like `aac_metrics.classes.cider_d.CIDErD`.
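
As a minimal sketch, the new `FENSE` metric can be used through its class version; the call convention below (candidates and multiple references in, corpus-level and sentence-level score dicts out) is assumed to mirror the functional metrics, so check the class documentation for the exact signature:

```python
from aac_metrics.classes.fense import FENSE

candidates: list[str] = ["a man is speaking", "rain falls on a roof"]
mult_references: list[list[str]] = [
    ["a man speaks.", "someone speaks."],
    ["heavy rain falls on a tin roof.", "it is raining hard."],
]

# FENSE is based on Sentence-BERT embeddings, so no PTBTokenizer step is applied here (assumption).
fense = FENSE()
corpus_scores, sents_scores = fense(candidates, mult_references)
print(corpus_scores)
# e.g. {"fense": tensor(...), ...}  (output keys are indicative only)
```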

## SPIDEr-max metric
SPIDEr-max [[7]](#spider-max) is a metric based on SPIDEr that takes into account multiple candidates for the same audio. It computes the maximum of the SPIDEr scores over these candidates to counteract SPIDEr's high sensitivity to the frequency of the words generated by the model.
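
As a rough illustration of the idea (a toy sketch, not the package implementation), the per-audio score keeps only the best candidate, using the SPIDEr values from the tables below:

```python
# Toy sketch of SPIDEr-max: for each audio, keep the maximum SPIDEr score among its candidates.
spider_scores_per_audio = [
    [0.562, 0.930, 0.594, 0.335, 0.594],  # beam search candidates for "rain.wav"
    [0.190, 1.259, 0.344, 0.231, 0.189],  # beam search candidates for "jid4t-FzUn0"
]
spider_max_per_audio = [max(scores) for scores in spider_scores_per_audio]
print(spider_max_per_audio)  # [0.930, 1.259]
```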

### SPIDEr-max: why?
The SPIDEr metric used in audio captioning is highly sensitive to the frequencies of the words used.

Here are a few examples of candidates and references for 2 different audios, with their associated SPIDEr score:
Here are 2 examples with the 5 candidates generated by the beam search algorithm, their corresponding SPIDEr scores and the associated references:

| Candidates | SPIDEr |
<center>

| Beam search candidates | SPIDEr |
|:---|:---:|
| heavy rain is falling on a roof | 0.562 |
| heavy rain is falling on a **tin** roof | **0.930** |
| heavy rain is falling on **a tin roof** | **0.930** |
| a heavy rain is falling on a roof | 0.594 |
| a heavy rain is falling on the ground | 0.335 |
| a heavy rain is falling on the roof | 0.594 |
@@ -109,16 +113,16 @@ Here is few examples of candidates and references for 2 differents audios, with
|:---|
| heavy rain falls loudly onto a structure with a thin roof |
| heavy rainfall falling onto a thin structure with a thin roof |
| it is raining hard and the rain hits a tin roof |
| it is raining hard and the rain hits **a tin roof** |
| rain that is pouring down very hard outside |
| the hard rain is noisy as it hits a tin roof |
| the hard rain is noisy as it hits **a tin roof** |

(References for the Clotho development-testing file named "rain.wav")
_(Candidates and references for the Clotho development-testing file named "rain.wav")_

| Candidates | SPIDEr |
| Beam search candidates | SPIDEr |
|:---|:---:|
| a woman speaks and a sheep bleats | 0.190 |
| a woman speaks and a **goat** bleats | **1.259** |
| a woman **speaks and a goat bleats** | **1.259** |
| a man speaks and a sheep bleats | 0.344 |
| an adult male speaks and a sheep bleats | 0.231 |
| an adult male is speaking and a sheep bleats | 0.189 |
@@ -128,10 +132,12 @@ Here is few examples of candidates and references for 2 differents audios, with
| a man speaking and laughing followed by a goat bleat |
| a man is speaking in high tone while a goat is bleating one time |
| a man speaks followed by a goat bleat |
| a person speaks and a goat bleats |
| a person **speaks and a goat bleats** |
| a man is talking and snickering followed by a goat bleating |

(References for an AudioCaps testing file (id: "jid4t-FzUn0"))
_(Candidates and references for an AudioCaps testing file with the id "jid4t-FzUn0")_

</center>

Even with very similar candidates, the SPIDEr scores vary drastically. To address this issue, we proposed the SPIDEr-max metric, which takes the maximum SPIDEr value over several candidates for the same audio.

Expand All @@ -148,22 +154,23 @@ mult_references: list[list[str]] = [["a man speaks.", "someone speaks.", "a man
mult_candidates = preprocess_mult_sents(mult_candidates)
mult_references = preprocess_mult_sents(mult_references)

global_scores, local_scores = spider_max(mult_candidates, mult_references)
print(global_scores)
corpus_scores, sents_scores = spider_max(mult_candidates, mult_references)
print(corpus_scores)
# {"spider": tensor(0.1), ...}
print(local_scores)
print(sents_scores)
# {"spider": tensor([0.9, ...]), ...}
```

## Requirements
### Python packages

The requirements are automatically installed when using `pip install` on this repository.
The pip requirements are automatically installed when using `pip install` on this repository.
```
torch >= 1.10.1
numpy >= 1.21.2
pyyaml >= 6.0
tqdm >= 4.64.0
sentence-transformers>=2.2.2
```

### External requirements
@@ -178,12 +185,13 @@ Most of these functions can specify a java executable path with `java_path` argu
### CIDEr or CIDEr-D?
The CIDEr [4] metric differs from CIDEr-D because it applies a stemmer to each word before computing the n-grams of the sentences. In AAC, only CIDEr-D is reported and used for SPIDEr, but some papers call it "CIDEr".

### Does metric work on multi-GPU ?
No. Most of these metrics use numpy or external java programs to run, which prevents multi-GPU testing for now.

### Is torchmetrics needed for this package?
No. But if torchmetrics is installed, all metric classes will inherit from the base class `torchmetrics.Metric`.
This is because most of the metrics do not use PyTorch tensors to compute scores, and numpy arrays and strings cannot be added to the states of `torchmetrics.Metric`.

***Additional note***: even when torchmetrics is installed, this package does not support multi-GPU testing.
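
For example, a quick (hypothetical) check of this behaviour in an environment where torchmetrics is installed:

```python
# Hypothetical check: when torchmetrics is available, the metric classes are
# expected to subclass torchmetrics.Metric, as described in the note above.
import torchmetrics

from aac_metrics import CIDErD

print(issubclass(CIDErD, torchmetrics.Metric))  # expected: True
```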

## References
#### BLEU
[1] K. Papineni, S. Roukos, T. Ward, and W.-J. Zhu, “BLEU: a
@@ -225,8 +233,11 @@ arXiv: 1612.00370. [Online]. Available: http://arxiv.org/abs/1612.00370
#### SPIDEr-max
[7] E. Labbé, T. Pellegrini, and J. Pinquier, “Is my automatic audio captioning system so bad? spider-max: a metric to consider several caption candidates,” Nov. 2022. [Online]. Available: https://hal.archives-ouvertes.fr/hal-03810396

## Cite the aac-metrics package
If you use this code with SPIDEr-max, you can cite the following paper:
#### FENSE
[8] Z. Zhou, Z. Zhang, X. Xu, Z. Xie, M. Wu, and K. Q. Zhu, "Can Audio Captions Be Evaluated with Image Caption Metrics?" arXiv, 2022. [Online]. Available: http://arxiv.org/abs/2110.04684

## Citation
If you use **SPIDEr-max**, you can cite the following paper:
```
@inproceedings{labbe:hal-03810396,
TITLE = {{Is my automatic audio captioning system so bad? spider-max: a metric to consider several caption candidates}},
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ torch>=1.10.1
numpy>=1.21.2
pyyaml>=6.0
tqdm>=4.64.0
sentence-transformers>=2.2.2
15 changes: 8 additions & 7 deletions setup.cfg
@@ -15,6 +15,7 @@ long_description_content_type = text/markdown
name = aac-metrics
project_urls =
Source = https://github.com/Labbeti/aac-metrics
PyPI = https://pypi.org/project/aac-metrics/
url = https://github.com/Labbeti/aac-metrics
version = attr: aac_metrics.__version__

@@ -25,18 +26,18 @@ package_dir=
python_requires = >= 3.9
install_requires =
torch>=1.10.1
matplotlib>=3.5.2
numpy>=1.21.2
pyyaml>=6.0
scikit-image>=0.19.2
tqdm>=4.64.0
matplotlib==3.5.2
sentence-transformers>=2.2.2

[options.extras_require]
dev =
pytest
flake8
black
dev=
pytest==7.1.2
flake8==4.0.1
black==22.8.0
scikit-image==0.19.2
matplotlib==3.5.2

[options.packages.find]
where=src
48 changes: 41 additions & 7 deletions src/aac_metrics/__init__.py
@@ -10,14 +10,48 @@
__license__ = "MIT"
__maintainer__ = "Etienne Labbé (Labbeti)"
__status__ = "Development"
__version__ = "0.1.2"
__version__ = "0.2.0"


from .classes.coco_bleu import CocoBLEU
from .classes.coco_cider_d import CocoCIDErD
from .classes.coco_meteor import CocoMETEOR
from .classes.coco_rouge_l import CocoRougeL
from .classes.coco_spice import CocoSPICE
from .classes.evaluate import AACEvaluate
from .classes.base import AACMetric
from .classes.bleu import BLEU
from .classes.cider_d import CIDErD
from .classes.evaluate import AACEvaluate, _get_metrics_classes_factory
from .classes.fense import FENSE
from .classes.meteor import METEOR
from .classes.rouge_l import ROUGEL
from .classes.spice import SPICE
from .classes.spider import SPIDEr
from .functional.evaluate import aac_evaluate


__all__ = [
"BLEU",
"CIDErD",
"AACEvaluate",
"FENSE",
"METEOR",
"ROUGEL",
"SPICE",
"SPIDEr",
"aac_evaluate",
]


def load_metric(name: str, **kwargs) -> AACMetric:
"""Load a metric class by name.
:param name: The name of the metric.
Must be one of ("bleu_1", "bleu_2", "bleu_3", "bleu_4", "meteor", "rouge_l", "cider_d", "spice", "spider", "fense").
:param **kwargs: The keyword optional arguments passed to the metric.
:returns: The Metric object built.
"""
name = name.lower().strip()

factory = _get_metrics_classes_factory(**kwargs)
if name in factory:
return factory[name]()
else:
raise ValueError(
f"Invalid argument {name=}. (expected one of {tuple(factory.keys())})"
)
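
A small usage sketch for the new `load_metric` helper (the accepted names come from the docstring above; the concrete class returned for each name is an assumption about the factory contents):

```python
from aac_metrics import load_metric

# Build a metric object by name.
metric = load_metric("cider_d")
print(type(metric).__name__)  # presumably "CIDErD"

# Unknown names raise a ValueError listing the accepted keys.
try:
    load_metric("not_a_metric")
except ValueError as err:
    print(err)
```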
30 changes: 22 additions & 8 deletions src/aac_metrics/classes/__init__.py
@@ -1,12 +1,26 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from .base import Metric
from .coco_bleu import CocoBLEU
from .coco_cider_d import CocoCIDErD
from .coco_meteor import CocoMETEOR
from .coco_rouge_l import CocoRougeL
from .coco_spice import CocoSPICE
from .evaluate import CustomEvaluate, AACEvaluate
from .spider_max import spider_max
from .bleu import BLEU
from .cider_d import CIDErD
from .evaluate import Evaluate, AACEvaluate
from .fense import FENSE
from .meteor import METEOR
from .rouge_l import ROUGEL
from .spice import SPICE
from .spider import SPIDEr
from .spider_max import SPIDErMax


__all__ = [
"BLEU",
"CIDErD",
"AACEvaluate",
"Evaluate",
"FENSE",
"METEOR",
"ROUGEL",
"SPICE",
"SPIDEr",
"SPIDErMax",
]