Skip to content

Commit 373f6a8

Browse files
authored
Merge pull request #560 from ferrocene/pa-ascii-art-frame
Find license identifiers in comments with ASCII art frames
2 parents 18624ad + ffe9692 commit 373f6a8

File tree

4 files changed

+45
-3
lines changed

4 files changed

+45
-3
lines changed

AUTHORS.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ Contributors
5656

5757
- Yaman Qalieh
5858

59+
- Pietro Albini <[email protected]>
60+
5961
Translators
6062
-----------
6163

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ The versions follow [semantic versioning](https://semver.org).
5858
### Fixed
5959

6060
- Sanitize xargs input in scripts documentation
61+
- License identifiers in comments with symmetrical ASCII art frames are now
62+
properly detected (#560)
6163

6264
- In PHP files, add header after `<?php` (#543).
6365

src/reuse/_util.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# SPDX-FileCopyrightText: 2022 Nico Rikken <[email protected]>
55
# SPDX-FileCopyrightText: 2022 Florian Snow <[email protected]>
66
# SPDX-FileCopyrightText: 2022 Carmen Bianca Bakker <[email protected]>
7+
# SPDX-FileCopyrightText: 2022 Pietro Albini <[email protected]>
78
#
89
# SPDX-License-Identifier: GPL-3.0-or-later
910

@@ -22,7 +23,7 @@
2223
from hashlib import sha1
2324
from os import PathLike
2425
from pathlib import Path
25-
from typing import BinaryIO, List, Optional, Set
26+
from typing import BinaryIO, Iterator, List, Optional, Set
2627

2728
from boolean.boolean import Expression, ParseError
2829
from debian.copyright import Copyright
@@ -53,7 +54,7 @@
5354
)
5455
)
5556
_IDENTIFIER_PATTERN = re.compile(
56-
r"SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
57+
r"^(.*?)SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
5758
)
5859
_COPYRIGHT_PATTERNS = [
5960
re.compile(
@@ -264,7 +265,7 @@ def extract_spdx_info(text: str) -> SpdxInfo:
264265
:raises ParseError: if an SPDX expression could not be parsed
265266
"""
266267
text = filter_ignore_block(text)
267-
expression_matches = set(map(str.strip, _IDENTIFIER_PATTERN.findall(text)))
268+
expression_matches = set(find_license_identifiers(text))
268269
expressions = set()
269270
copyright_matches = set()
270271
for expression in expression_matches:
@@ -287,6 +288,28 @@ def extract_spdx_info(text: str) -> SpdxInfo:
287288
return SpdxInfo(expressions, copyright_matches)
288289

289290

291+
def find_license_identifiers(text: str) -> Iterator[str]:
    """Yield the license identifiers that follow an SPDX-License-Identifier
    tag in *text*, stripped of extraneous whitespace or comment formatting.

    :param text: the text to search with ``_IDENTIFIER_PATTERN``.
    :return: an iterator over the cleaned-up identifier strings, one per
        match, in the order the matches occur.
    """
    for prefix, identifier in _IDENTIFIER_PATTERN.findall(text):
        prefix, identifier = prefix.strip(), identifier.strip()

        # Some comment headers have ASCII art to "frame" the comment, like this:
        #
        # /***********************\
        # |*  This is a comment  *|
        # \***********************/
        #
        # To ensure we parse them correctly, if the line ends with the inverse
        # of the comment prefix, we strip that suffix. See #343 for a real
        # world example of a project doing this (LLVM).
        suffix = prefix[::-1]

        # Only treat the prefix as frame decoration when it is made purely of
        # symbols. A prefix containing letters or digits (for instance the
        # Fortran comment leader "C", or ordinary words preceding the tag)
        # could otherwise coincide with the last characters of the identifier
        # itself and truncate it ("ISC" would become "IS").
        is_frame = bool(suffix) and not any(char.isalnum() for char in suffix)
        if is_frame and identifier.endswith(suffix):
            identifier = identifier[: -len(suffix)]

        yield identifier.strip()
311+
312+
290313
def filter_ignore_block(text: str) -> str:
291314
"""Filter out blocks beginning with REUSE_IGNORE_START and ending with
292315
REUSE_IGNORE_END to remove lines that should not be treated as copyright and

tests/test_util.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# SPDX-FileCopyrightText: 2022 Nico Rikken <[email protected]>
44
# SPDX-FileCopyrightText: 2022 Florian Snow <[email protected]>
55
# SPDX-FileCopyrightText: 2022 Carmen Bianca Bakker <[email protected]>
6+
# SPDX-FileCopyrightText: 2022 Pietro Albini <[email protected]>
67
#
78
# SPDX-License-Identifier: GPL-3.0-or-later
89

@@ -51,6 +52,20 @@ def test_extract_expression():
5152
assert result.spdx_expressions == {_LICENSING.parse(expression)}
5253

5354

55+
def test_extract_expression_from_ascii_art_frame():
    """An identifier enclosed in an ASCII art frame is still extracted."""
    framed_comment = cleandoc(
        """
        /**********************************\\
        |* SPDX-License-Identifier: MIT *|
        \\**********************************/
        """
    )
    spdx_info = _util.extract_spdx_info(framed_comment)
    found_expressions = spdx_info.spdx_expressions
    assert found_expressions == {_LICENSING.parse("MIT")}
67+
68+
5469
def test_extract_erroneous_expression():
5570
"""Parse an incorrect expression."""
5671
expression = "SPDX-License-Identifier: GPL-3.0-or-later AND (MIT OR)"

0 commit comments

Comments
 (0)