4
4
# SPDX-FileCopyrightText: 2022 Nico Rikken <[email protected] >
5
5
# SPDX-FileCopyrightText: 2022 Florian Snow <[email protected] >
6
6
# SPDX-FileCopyrightText: 2022 Carmen Bianca Bakker <[email protected] >
7
+ # SPDX-FileCopyrightText: 2022 Pietro Albini <[email protected] >
7
8
#
8
9
# SPDX-License-Identifier: GPL-3.0-or-later
9
10
22
23
from hashlib import sha1
23
24
from os import PathLike
24
25
from pathlib import Path
25
- from typing import BinaryIO , List , Optional , Set
26
+ from typing import BinaryIO , Iterator , List , Optional , Set
26
27
27
28
from boolean .boolean import Expression , ParseError
28
29
from debian .copyright import Copyright
53
54
)
54
55
)
55
56
_IDENTIFIER_PATTERN = re .compile (
56
- r"SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN , re .MULTILINE
57
+ r"^(.*?) SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN , re .MULTILINE
57
58
)
58
59
_COPYRIGHT_PATTERNS = [
59
60
re .compile (
@@ -264,7 +265,7 @@ def extract_spdx_info(text: str) -> SpdxInfo:
264
265
:raises ParseError: if an SPDX expression could not be parsed
265
266
"""
266
267
text = filter_ignore_block (text )
267
- expression_matches = set (map ( str . strip , _IDENTIFIER_PATTERN . findall ( text ) ))
268
+ expression_matches = set (find_license_identifiers ( text ))
268
269
expressions = set ()
269
270
copyright_matches = set ()
270
271
for expression in expression_matches :
@@ -287,6 +288,28 @@ def extract_spdx_info(text: str) -> SpdxInfo:
287
288
return SpdxInfo (expressions , copyright_matches )
288
289
289
290
291
def find_license_identifiers(text: str) -> Iterator[str]:
    """Yield every license identifier matched by ``_IDENTIFIER_PATTERN``.

    Each match is cleaned of surrounding whitespace and of any trailing
    comment-frame decoration before it is yielded.

    :param text: text to scan for ``SPDX-License-Identifier`` tags.
    :return: an iterator over the cleaned-up identifier strings, one per
        match; duplicates are not removed here.
    """
    for raw_prefix, raw_identifier in _IDENTIFIER_PATTERN.findall(text):
        leading = raw_prefix.strip()
        expression = raw_identifier.strip()

        # Comment headers are sometimes "framed" with ASCII art, like this:
        #
        #   /***********************\
        #   |*  This is a comment  *|
        #   \***********************/
        #
        # On such lines the text after the tag ends with the mirror image of
        # the text before it, so that mirrored prefix is cut off the end.
        # See #343 for a real world example of a project doing this (LLVM).
        mirrored = leading[::-1]
        if mirrored and expression.endswith(mirrored):
            # endswith() guarantees the expression is at least as long as the
            # mirrored prefix, so this slice bound is never negative.
            expression = expression[: len(expression) - len(mirrored)]

        yield expression.strip()
311
+
312
+
290
313
def filter_ignore_block (text : str ) -> str :
291
314
"""Filter out blocks beginning with REUSE_IGNORE_START and ending with
292
315
REUSE_IGNORE_END to remove lines that should not be treated as copyright and
0 commit comments