Merge pull request #560 from ferrocene/pa-ascii-art-frame

carmenbianca · web-flow · commit 373f6a8371e1 · 2022-09-22T14:06:49.000+02:00
Find license identifiers in comments with ASCII art frames
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -56,6 +56,8 @@ Contributors
 
 - Yaman Qalieh
 
+- Pietro Albini <pietro.albini@ferrous-systems.com>
+
 Translators
 -----------
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -58,6 +58,8 @@ The versions follow [semantic versioning](https://semver.org).
 ### Fixed
 
 - Sanitize xargs input in scripts documentation
+- License identifiers in comments with symmetrical ASCII art frames are now
+  properly detected (#560)
 
 - In PHP files, add header after `<?php` (#543).
 
diff --git a/src/reuse/_util.py b/src/reuse/_util.py
@@ -4,6 +4,7 @@
 # SPDX-FileCopyrightText: 2022 Nico Rikken <nico.rikken@fsfe.org>
 # SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
 # SPDX-FileCopyrightText: 2022 Carmen Bianca Bakker <carmenbianca@fsfe.org>
+# SPDX-FileCopyrightText: 2022 Pietro Albini <pietro.albini@ferrous-systems.com>
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
@@ -22,7 +23,7 @@
 from hashlib import sha1
 from os import PathLike
 from pathlib import Path
-from typing import BinaryIO, List, Optional, Set
+from typing import BinaryIO, Iterator, List, Optional, Set
 
 from boolean.boolean import Expression, ParseError
 from debian.copyright import Copyright
@@ -53,7 +54,7 @@
     )
 )
 _IDENTIFIER_PATTERN = re.compile(
-    r"SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
+    r"^(.*?)SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
 )
 _COPYRIGHT_PATTERNS = [
     re.compile(
@@ -264,7 +265,7 @@ def extract_spdx_info(text: str) -> SpdxInfo:
     :raises ParseError: if an SPDX expression could not be parsed
     """
     text = filter_ignore_block(text)
-    expression_matches = set(map(str.strip, _IDENTIFIER_PATTERN.findall(text)))
+    expression_matches = set(find_license_identifiers(text))
     expressions = set()
     copyright_matches = set()
     for expression in expression_matches:
@@ -287,6 +288,28 @@ def extract_spdx_info(text: str) -> SpdxInfo:
     return SpdxInfo(expressions, copyright_matches)
 
 
+def find_license_identifiers(text: str) -> Iterator[str]:
+    """Extract all the license identifiers matching the IDENTIFIER_PATTERN
+    regex, taking care of stripping extraneous whitespace of formatting."""
+    for prefix, identifier in _IDENTIFIER_PATTERN.findall(text):
+        prefix, identifier = prefix.strip(), identifier.strip()
+
+        # Some comment headers have ASCII art to "frame" the comment, like this:
+        #
+        # /***********************\
+        # |*  This is a comment  *|
+        # \***********************/
+        #
+        # To ensure we parse them correctly, if the line ends with the inverse
+        # of the comment prefix, we strip that suffix. See #343 for a real
+        # world example of a project doing this (LLVM).
+        suffix = prefix[::-1]
+        if suffix and identifier.endswith(suffix):
+            identifier = identifier[: -len(suffix)]
+
+        yield identifier.strip()
+
+
 def filter_ignore_block(text: str) -> str:
     """Filter out blocks beginning with REUSE_IGNORE_START and ending with
     REUSE_IGNORE_END to remove lines that should not be treated as copyright and
diff --git a/tests/test_util.py b/tests/test_util.py
@@ -3,6 +3,7 @@
 # SPDX-FileCopyrightText: 2022 Nico Rikken <nico.rikken@fsfe.org>
 # SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
 # SPDX-FileCopyrightText: 2022 Carmen Bianca Bakker <carmenbianca@fsfe.org>
+# SPDX-FileCopyrightText: 2022 Pietro Albini <pietro.albini@ferrous-systems.com>
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
@@ -51,6 +52,20 @@ def test_extract_expression():
         assert result.spdx_expressions == {_LICENSING.parse(expression)}
 
 
+def test_extract_expression_from_ascii_art_frame():
+    """Parse an expression from an ASCII art frame"""
+    result = _util.extract_spdx_info(
+        cleandoc(
+            """
+             /**********************************\\
+             |*  SPDX-License-Identifier: MIT  *|
+             \\**********************************/
+            """
+        )
+    )
+    assert result.spdx_expressions == {_LICENSING.parse("MIT")}
+
+
 def test_extract_erroneous_expression():
     """Parse an incorrect expression."""
     expression = "SPDX-License-Identifier: GPL-3.0-or-later AND (MIT OR)"

-Original file line number
+Diff line change
 - Yaman Qalieh
 +- Pietro Albini <pietro.albini@ferrous-systems.com>
++
 Translators
 -----------