Version 0.1.2
Labbeti committed Oct 31, 2022
1 parent fc27b01 commit f6b50c9
Showing 28 changed files with 354 additions and 173 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,19 @@

All notable changes to this project will be documented in this file.

## [0.1.2] 2022-10-31
### Added
- Option `return_all_cands_scores` to return the scores of all candidates for SPIDEr-max.
- Functions `is_mono_sents` and `is_mult_sents` to detect `list[str]` sentences and `list[list[str]]` multiple sentences.
- Functions `flat_list` and `unflat_list` to flatten multiple sentences into a single list of sentences and back.

### Changed
- Update default value used for `return_all_scores` in cider and rouge functions.
- Update internal metric factory with functions instead of classes to avoid cyclic dependency.

### Fixed
- Fix SPIDEr-max local scores output shape.

## [0.1.1] 2022-09-30
### Added
- Documentation for metric functions and classes.
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1,5 +1,5 @@

recursive-include . *.py
recursive-include src *.py
global-exclude *.pyc
global-exclude __pycache__

84 changes: 49 additions & 35 deletions README.md
@@ -14,42 +14,42 @@ Audio Captioning metrics source code, designed for Pytorch.
</div>

This package is a tool to evaluate sentences produced by automatic models that caption images or audio.
The results of BLEU [1], ROUGE-L [2], METEOR [3], CIDEr [4], SPICE [5] and SPIDEr [6] are consistents with https://github.com/audio-captioning/caption-evaluation-tools.
The results of BLEU [[1]](#bleu), ROUGE-L [[2]](#rouge-l), METEOR [[3]](#meteor), CIDEr-D [[4]](#cider), SPICE [[5]](#spice) and SPIDEr [[6]](#spider) are consistent with [caption-evaluation-tools](https://github.com/audio-captioning/caption-evaluation-tools).

## Why use this package?
- Easy installation with pip
- Consistent with audio caption metrics code https://github.com/audio-captioning/caption-evaluation-tools
- Consistent with [caption-evaluation-tools](https://github.com/audio-captioning/caption-evaluation-tools)
- Provides functions and classes to compute metrics separately
- Provides SPIDEr-max metric as described in the DCASE paper [7].
- Provides the SPIDEr-max metric as described in the DCASE paper [[7]](#spider-max)

## Installation
Install the pip package:
```
```bash
pip install aac-metrics
```

Download the external code needed for METEOR, SPICE and PTBTokenizer:
```
```bash
aac-metrics-download
```

Note: The external code for SPICE, METEOR and PTBTokenizer is stored in the cache directory (default: `$HOME/aac-metrics-cache/`).

## Metrics
### AAC metrics
| Metric | Origin | Range | Short description |
|:---:|:---:|:---:|:---:|
| BLEU [1] | machine translation | [0, 1] | Precision of n-grams |
| ROUGE-L [2] | machine translation | [0, 1] | FScore of the longest common subsequence |
| METEOR [3] | machine translation | [0, 1] | Cosine-similarity of frequencies |
| CIDEr-D [4] | image captioning | [0, 10] | Cosine-similarity of TF-IDF |
| SPICE [5] | image captioning | [0, 1] | FScore of semantic graph |
| SPIDEr [6] | image captioning | [0, 5.5] | Mean of CIDEr-D and SPICE |
### Default AAC metrics
| Metric | Python Class | Origin | Range | Short description |
|:---|:---|:---|:---|:---|
| BLEU [[1]](#bleu) | `CocoBLEU` | machine translation | [0, 1] | Precision of n-grams |
| ROUGE-L [[2]](#rouge-l) | `CocoRougeL` | machine translation | [0, 1] | FScore of the longest common subsequence |
| METEOR [[3]](#meteor) | `CocoMETEOR` | machine translation | [0, 1] | Cosine-similarity of frequencies |
| CIDEr-D [[4]](#cider) | `CocoCIDErD` | image captioning | [0, 10] | Cosine-similarity of TF-IDF |
| SPICE [[5]](#spice) | `CocoSPICE` | image captioning | [0, 1] | FScore of semantic graph |
| SPIDEr [[6]](#spider) | `SPIDEr` | image captioning | [0, 5.5] | Mean of CIDEr-D and SPICE |

### Other metrics
| Metric | Origin | Range | Short description |
|:---:|:---:|:---:|:---:|
| SPIDEr-max [7] | audio captioning | [0, 5.5] | Max of SPIDEr scores for multiples candidates |
| Metric name | Python Class | Origin | Range | Short description |
|:---|:---|:---|:---|:---|
| SPIDEr-max [[7]](#spider-max) | `SPIDErMax` | audio captioning | [0, 5.5] | Max of SPIDEr scores for multiple candidates |

## Usage
### Evaluate AAC metrics
@@ -68,7 +68,7 @@ print(global_scores)
```
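
A minimal sketch of the usage described above (the full example is collapsed in this diff view). The data values are hypothetical, and the return of two score dictionaries is assumed from the surrounding description:

```python
from aac_metrics import aac_evaluate

# Hypothetical data: one candidate caption per audio clip,
# several reference captions per clip.
candidates: list[str] = ["a man is speaking", "rain falls on a metal roof"]
mult_references: list[list[str]] = [
    ["a man speaks to a crowd", "someone is talking"],
    ["heavy rain hits a roof", "rain pours down outside"],
]

# Assumed return values: corpus-level and sentence-level score dictionaries.
global_scores, local_scores = aac_evaluate(candidates, mult_references)
print(global_scores)
```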

### Evaluate a specific metric
Evaluate a specific metric can be done using the `aac_metrics.functional.<metric_name>.<metric_name>` function. Unlike `aac_evaluate`, the tokenization with PTBTokenizer is not done with these functions, but you can do it before with `preprocess_mono_sents` and `preprocess_mult_sents` functions.
Evaluating a specific metric can be done using the `aac_metrics.functional.<metric_name>.<metric_name>` function or the `aac_metrics.classes.<metric_name>.<metric_name>` class. Unlike `aac_evaluate`, tokenization with PTBTokenizer is not applied by these functions, but you can do it manually beforehand with the `preprocess_mono_sents` and `preprocess_mult_sents` functions.

```python
from aac_metrics.functional import coco_cider_d
@@ -89,8 +89,8 @@ print(local_scores)

Each metric also exists as a Python class version, like `aac_metrics.classes.coco_cider_d.CocoCIDErD`.
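
As a sketch of this class-based usage, the example below assumes that `CocoCIDErD` accepts the same `(candidates, mult_references)` call arguments as the functional version and that the returned dictionary uses a `"cider_d"` key; both are assumptions to verify against the package:

```python
from aac_metrics.classes.coco_cider_d import CocoCIDErD

# Hypothetical, already-tokenized sentences (see the preprocess_* functions above).
candidates = ["a man is speaking", "birds are chirping"]
mult_references = [
    ["a man speaks", "someone is talking"],
    ["several birds sing", "birds chirp in the background"],
]

cider_d = CocoCIDErD(return_all_scores=True)
# Assumed call signature, mirroring the functional API.
global_scores, local_scores = cider_d(candidates, mult_references)
print(global_scores["cider_d"])  # score key name assumed
```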

## SPIDEr-max
SPIDEr-max [7] is a metric based on SPIDEr that takes into account multiple candidates for the same audio. It computes the maximum of the SPIDEr scores for each candidate to balance the high sensitivity to the frequency of the words generated by the model.
## SPIDEr-max metric
SPIDEr-max [[7]](#spider-max) is a metric based on SPIDEr that takes into account multiple candidates for the same audio. It computes the maximum of the SPIDEr scores over the candidates, to mitigate SPIDEr's high sensitivity to the frequency of the words generated by the model.
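
A conceptual sketch of this "max over candidates" idea is given below. It is not the package implementation: `spider_score` is a hypothetical stand-in for a real per-sentence SPIDEr computation.

```python
from typing import Callable

def spider_max_sketch(
    mult_candidates: list[list[str]],  # several candidate captions per audio clip
    mult_references: list[list[str]],  # several reference captions per audio clip
    spider_score: Callable[[str, list[str]], float],  # hypothetical SPIDEr helper
) -> list[float]:
    """Return, for each audio clip, the best SPIDEr score among its candidates."""
    return [
        max(spider_score(cand, refs) for cand in cands)
        for cands, refs in zip(mult_candidates, mult_references)
    ]
```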

### SPIDEr-max: why?
The SPIDEr metric used in audio captioning is highly sensitive to the frequencies of the words used.
@@ -176,59 +176,73 @@ Most of these functions can specify a java executable path with `java_path` argument

## Additional notes
### CIDEr or CIDEr-D?
The CIDEr [4] metric differs from CIDEr-D because it apply a stemmer to each words before computing the n-grams of the sentences. In AAC, only the CIDEr-D is reported and used for SPIDEr, but some papers called it "CIDEr".
The CIDEr [4] metric differs from CIDEr-D because it applies a stemmer to each word before computing the n-grams of the sentences. In AAC, only CIDEr-D is reported and used for SPIDEr, but some papers call it "CIDEr".

### Is torchmetrics needed for this package?
No. But if torchmetrics is installed, all metrics classes will inherit from the base class `torchmetrics.Metric`.
It is because most of the metrics does not use PyTorch tensors to compute scores and numpy or string cannot be added to states of `torchmetrics.Metric`.
This is because most of the metrics do not use PyTorch tensors to compute scores, and numpy arrays and strings cannot be added to the states of `torchmetrics.Metric`.

***Additional note***: even when torchmetrics is installed, this package does not support multi-GPU testing.
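
A minimal sketch of the conditional base-class pattern described in this note is given below; the names are hypothetical and the actual package may organize this differently.

```python
# Hypothetical sketch: inherit from torchmetrics.Metric only when it is available.
try:
    from torchmetrics import Metric as _BaseMetric
except ImportError:
    class _BaseMetric:
        """Fallback base class used when torchmetrics is not installed."""

        def __call__(self, *args, **kwargs):
            self.update(*args, **kwargs)
            return self.compute()

class MyTextMetric(_BaseMetric):
    """Toy metric keeping plain Python strings instead of tensor states."""

    def __init__(self) -> None:
        super().__init__()
        self._candidates: list[str] = []

    def update(self, candidates: list[str]) -> None:
        self._candidates += candidates

    def compute(self) -> float:
        # Dummy score: average candidate length in words.
        return sum(len(c.split()) for c in self._candidates) / max(len(self._candidates), 1)
```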

## References
#### BLEU
[1] K. Papineni, S. Roukos, T. Ward, and W.-J. Zhu, “BLEU: a method for automatic evaluation of machine translation,” in Proceedings of the 40th Annual Meeting on Association for Computational Linguistics - ACL ’02. Philadelphia, Pennsylvania: Association for Computational Linguistics, 2001, p. 311. [Online]. Available: http://portal.acm.org/citation.cfm?doid=1073083.1073135

#### Rouge-L
[2] C.-Y. Lin, “ROUGE: A package for automatic evaluation of summaries,” in Text Summarization Branches Out. Barcelona, Spain: Association for Computational Linguistics, Jul. 2004, pp. 74–81. [Online]. Available: https://aclanthology.org/W04-1013

#### METEOR
[3] M. Denkowski and A. Lavie, “Meteor Universal: Language Specific Translation Evaluation for Any Target Language,” in Proceedings of the Ninth Workshop on Statistical Machine Translation. Baltimore, Maryland, USA: Association for Computational Linguistics, 2014, pp. 376–380. [Online]. Available: http://aclweb.org/anthology/W14-3348

#### CIDEr
[4] R. Vedantam, C. L. Zitnick, and D. Parikh, “CIDEr: Consensus-based Image Description Evaluation,” arXiv:1411.5726 [cs], Jun. 2015. [Online]. Available: http://arxiv.org/abs/1411.5726

#### SPICE
[5] P. Anderson, B. Fernando, M. Johnson, and S. Gould, “SPICE: Semantic Propositional Image Caption Evaluation,” arXiv:1607.08822 [cs], Jul. 2016. [Online]. Available: http://arxiv.org/abs/1607.08822

#### SPIDEr
[6] S. Liu, Z. Zhu, N. Ye, S. Guadarrama, and K. Murphy, “Improved Image Captioning via Policy Gradient optimization of SPIDEr,” 2017 IEEE International Conference on Computer Vision (ICCV), pp. 873–881, Oct. 2017, arXiv: 1612.00370. [Online]. Available: http://arxiv.org/abs/1612.00370

<!-- TODO : update ref -->
Note: the following reference is **temporary**:

[7] E. Labbe, T. Pellegrini, J. Pinquier, "IS MY AUTOMATIC AUDIO CAPTIONING SYSTEM SO BAD? SPIDEr-max: A METRIC TO CONSIDER SEVERAL CAPTION CANDIDATES", DCASE2022 Workshop.
#### SPIDEr-max
[7] E. Labbé, T. Pellegrini, and J. Pinquier, “Is my automatic audio captioning system so bad? spider-max: a metric to consider several caption candidates,” Nov. 2022. [Online]. Available: https://hal.archives-ouvertes.fr/hal-03810396

## Cite the aac-metrics package
The associated paper has been accepted but it will be published after the DCASE2022 workshop.

If you use this code, you can cite with the following **temporary** citation:
<!-- TODO : update citation and create CITATION.cff file -->
If you use this code with SPIDEr-max, you can cite the following paper:
```
@inproceedings{Labbe2022,
author = "Etienne Labbe, Thomas Pellegrini, Julien Pinquier",
title = "IS MY AUTOMATIC AUDIO CAPTIONING SYSTEM SO BAD? SPIDEr-max: A METRIC TO CONSIDER SEVERAL CAPTION CANDIDATES",
month = "November",
year = "2022",
@inproceedings{labbe:hal-03810396,
TITLE = {{Is my automatic audio captioning system so bad? spider-max: a metric to consider several caption candidates}},
AUTHOR = {Labb{\'e}, Etienne and Pellegrini, Thomas and Pinquier, Julien},
URL = {https://hal.archives-ouvertes.fr/hal-03810396},
BOOKTITLE = {{Workshop DCASE}},
ADDRESS = {Nancy, France},
YEAR = {2022},
MONTH = Nov,
KEYWORDS = {audio captioning ; evaluation metric ; beam search ; multiple candidates},
PDF = {https://hal.archives-ouvertes.fr/hal-03810396/file/Labbe_DCASE2022.pdf},
HAL_ID = {hal-03810396},
HAL_VERSION = {v1},
}
```

## Contact
Maintainer:
- Etienne Labbé "Labbeti": [email protected]
15 changes: 8 additions & 7 deletions install_spice.sh
@@ -21,34 +21,35 @@ fi

fname_zip="SPICE-1.0.zip"
fpath_zip="$dpath_spice/$fname_zip"
bn0=`basename $0`

echo "[$0] Start installation of SPICE metric java code in directory \"$dpath_spice\"..."
echo "[$bn0] Start installation of SPICE metric java code in directory \"$dpath_spice\"..."

if [ ! -f "$fpath_zip" ]; then
echo "[$0] Zip file not found, downloading from https://panderson.me..."
echo "[$bn0] Zip file not found, downloading from https://panderson.me..."
wget https://panderson.me/images/SPICE-1.0.zip -P "$dpath_spice"
fi

dpath_unzip="$dpath_spice/SPICE-1.0"
if [ ! -d "$dpath_unzip" ]; then
echo "[$0] Unzipping file $dpath_zip..."
echo "[$bn0] Unzipping file $dpath_zip..."
unzip $fpath_zip -d "$dpath_spice"

echo "[$0] Downloading Stanford models..."
echo "[$bn0] Downloading Stanford models..."
bash $dpath_unzip/get_stanford_models.sh
fi

dpath_lib="$dpath_spice/lib"
if [ ! -d "$dpath_lib" ]; then
echo "[$0] Moving lib directory to \"$dpath_spice\"..."
echo "[$bn0] Moving lib directory to \"$dpath_spice\"..."
mv "$dpath_unzip/lib" "$dpath_spice"
fi

fpath_jar="$dpath_spice/spice-1.0.jar"
if [ ! -f "$fpath_jar" ]; then
echo "[$0] Moving spice-1.0.jar file to \"$dpath_spice\"..."
echo "[$bn0] Moving spice-1.0.jar file to \"$dpath_spice\"..."
mv "$dpath_unzip/spice-1.0.jar" "$dpath_spice"
fi

echo "[$0] SPICE metric Java code is installed."
echo "[$bn0] SPICE metric Java code is installed."
exit 0
1 change: 1 addition & 0 deletions setup.cfg
@@ -30,6 +30,7 @@ install_requires =
pyyaml>=6.0
scikit-image>=0.19.2
tqdm>=4.64.0
matplotlib==3.5.2

[options.extras_require]
dev =
5 changes: 3 additions & 2 deletions src/aac_metrics/__init__.py
@@ -6,17 +6,18 @@

__name__ = "aac-metrics"
__author__ = "Etienne Labbé (Labbeti)"
__author_email__ = "[email protected]"
__license__ = "MIT"
__maintainer__ = "Etienne Labbé (Labbeti)"
__status__ = "Development"
__version__ = "0.1.1"
__version__ = "0.1.2"


from .functional.evaluate import aac_evaluate
from .classes.coco_bleu import CocoBLEU
from .classes.coco_cider_d import CocoCIDErD
from .classes.coco_meteor import CocoMETEOR
from .classes.coco_rouge_l import CocoRougeL
from .classes.coco_spice import CocoSPICE
from .classes.evaluate import AACEvaluate
from .classes.spider import SPIDEr
from .functional.evaluate import aac_evaluate
1 change: 1 addition & 0 deletions src/aac_metrics/classes/__init__.py
@@ -7,5 +7,6 @@
from .coco_meteor import CocoMETEOR
from .coco_rouge_l import CocoRougeL
from .coco_spice import CocoSPICE
from .evaluate import CustomEvaluate, AACEvaluate
from .spider_max import spider_max
from .spider import SPIDEr
2 changes: 1 addition & 1 deletion src/aac_metrics/classes/coco_bleu.py
@@ -5,12 +5,12 @@

from torch import Tensor

from aac_metrics.classes.base import Metric
from aac_metrics.functional.coco_bleu import (
BLEU_COCO_OPTIONS,
_coco_bleu_compute,
_coco_bleu_update,
)
from aac_metrics.classes.base import Metric


class CocoBLEU(Metric):
3 changes: 3 additions & 0 deletions src/aac_metrics/classes/coco_cider_d.py
@@ -33,12 +33,14 @@ def __init__(
n: int = 4,
sigma: float = 6.0,
tokenizer: Callable[[str], list[str]] = str.split,
return_tfidf: bool = False,
) -> None:
super().__init__()
self._return_all_scores = return_all_scores
self._n = n
self._sigma = sigma
self._tokenizer = tokenizer
self._return_tfidf = return_tfidf

self._cooked_cands = []
self._cooked_mrefs = []
Expand All @@ -50,6 +52,7 @@ def compute(self) -> Union[tuple[dict[str, Tensor], dict[str, Tensor]], Tensor]:
self._return_all_scores,
self._n,
self._sigma,
self._return_tfidf,
)

def reset(self) -> None: