Skip to content

Commit

Permalink
Prefer exact license match over SPDX
Browse files Browse the repository at this point in the history
Add a new matcher_order attribute to LicenseMatch and use it for sorting
matches rather than the matcher string.
This way we can ensure that there is a proper precedence between
matchers when two matches cover exactly the same text.

The new sort order for matchers is as follows (lowest value wins):
- 0: 1-hash
- 1: 2-aho
- 2: 1-spdx-id
- 3: 3-seq
- 4: 5-undetected
- 5: 5-aho-frag
- 6: 6-unknown

The outcome is that a hash or aho match for the same text at the same
position will take precedence over the SPDX id match, making it possible
to curate and correct some incorrect license expressions if needed.

Reference: #3912
Reported-by: Ayan Sinha Mahapatra <[email protected]>
Signed-off-by: Philippe Ombredanne <[email protected]>
  • Loading branch information
pombredanne committed Sep 12, 2024
1 parent 9a7df2c commit c581828
Show file tree
Hide file tree
Showing 9 changed files with 98 additions and 25 deletions.
2 changes: 2 additions & 0 deletions src/licensedcode/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def logger_debug(*args):
return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))

MATCHER_UNDETECTED = '5-undetected'
MATCHER_UNDETECTED_ORDER = 4

# Matches with a match_coverage below this value are not considered
# perfect detections
Expand Down Expand Up @@ -1627,6 +1628,7 @@ def get_undetected_matches(query_string):
hispan=hispan,
query_run_start=match_start,
matcher=MATCHER_UNDETECTED,
matcher_order=MATCHER_UNDETECTED_ORDER,
query=query_run.query,
)

Expand Down
29 changes: 24 additions & 5 deletions src/licensedcode/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@
The filter functions are executed in a specific sequence over the list of matches.
"""

TRACE = False
TRACE_MERGE = False
TRACE = True
TRACE_MERGE = True
TRACE_REFINE = False
TRACE_FILTER_FALSE_POSITIVE = False
TRACE_FILTER_CONTAINED = False
Expand Down Expand Up @@ -213,6 +213,15 @@ class LicenseMatch(object):
)
)

matcher_order = attr.ib(
default=0,
metadata=dict(
help='An integer indicating the precedence of a matcher when compared to other matchers '
'where the lowest value has the highest precedence. Used to select which of two '
'equal matches to keep.'
)
)

start_line = attr.ib(
default=0,
metadata=dict(help='match start line, 1-based')
Expand Down Expand Up @@ -624,8 +633,10 @@ def combine(self, other):

if other.matcher not in self.matcher:
newmatcher = ' '.join([self.matcher, other.matcher])
newmatcher_order = max([self.matcher_order, other.matcher_order])
else:
newmatcher = self.matcher
newmatcher_order = self.matcher_order

if (
self.discard_reason == DiscardReason.NOT_DISCARDED
Expand Down Expand Up @@ -655,6 +666,7 @@ def combine(self, other):
hispan=Span(self.hispan | other.hispan),
query_run_start=min(self.query_run_start, other.query_run_start),
matcher=newmatcher,
matcher_order=newmatcher_order,
query=self.query,
discard_reason=discard_reason,
)
Expand All @@ -671,6 +683,7 @@ def update(self, other):
self.matcher = combined.matcher
self.query_run_start = min(self.query_run_start, other.query_run_start)
self.matcher = combined.matcher
self.matcher_order = combined.matcher_order
self.discard_reason = combined.discard_reason
return self

Expand Down Expand Up @@ -852,7 +865,7 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):

# only merge matches with the same rule: sort then group by rule for the
# same rule, sort on start, longer high, longer match, matcher type
sorter = lambda m: (m.rule.identifier, m.qspan.start, -m.hilen(), -m.len(), m.matcher)
sorter = lambda m: (m.rule.identifier, m.qspan.start, -m.hilen(), -m.len(), m.matcher_order)
matches.sort(key=sorter)
matches_by_rule = [
(rid, list(rule_matches))
Expand Down Expand Up @@ -1069,7 +1082,7 @@ def filter_contained_matches(

# NOTE: we do not filter matches in place: sorted creates a copy
# sort on start, longer high, longer match, matcher type
sorter = lambda m: (m.qspan.start, -m.hilen(), -m.len(), m.matcher)
sorter = lambda m: (m.qspan.start, -m.hilen(), -m.len(), m.matcher_order)
matches = sorted(matches, key=sorter)
matches_pop = matches.pop

Expand Down Expand Up @@ -1190,7 +1203,7 @@ def filter_overlapping_matches(

# NOTE: we do not filter matches in place: sorted creates a copy
# sort on start, longer high, longer match, matcher type
sorter = lambda m: (m.qspan.start, -m.hilen(), -m.len(), m.matcher)
sorter = lambda m: (m.qspan.start, -m.hilen(), -m.len(), m.matcher_order)
matches = sorted(matches, key=sorter)
matches_pop = matches.pop

Expand Down Expand Up @@ -2734,6 +2747,12 @@ def _log(_matches, _discarded, msg):
matches, discarded_contained = filter_contained_matches(matches)
_log(matches, discarded_contained, 'NON CONTAINED')

if trace_basic:
logger_debug(' #####refine_matches: after FILTER matches#', len(matches))
if trace:
for m in matches:
logger_debug(m)

matches, discarded_overlapping = filter_overlapping_matches(matches)
_log(matches, discarded_overlapping, 'NON OVERLAPPING')

Expand Down
33 changes: 29 additions & 4 deletions src/licensedcode/match_aho.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,19 @@ def add_sequence(automaton, tids, rid, start=0, with_duplicates=False):


MATCH_AHO_EXACT = '2-aho'
MATCH_AHO_EXACT_ORDER = 1
MATCH_AHO_FRAG = '5-aho-frag'
MATCH_AHO_FRAG_ORDER = 5


def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):
def exact_match(
idx,
query_run,
automaton,
matcher=MATCH_AHO_EXACT,
matcher_order=MATCH_AHO_EXACT_ORDER,
**kwargs,
):
"""
Return a list of exact LicenseMatch by matching the `query_run` against
the `automaton` and `idx` index.
Expand Down Expand Up @@ -111,7 +120,15 @@ def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):

rule = rules_by_rid[rid]
match = LicenseMatch(
rule, qspan, ispan, hispan, qbegin, matcher=matcher, query=query)
rule=rule,
qspan=qspan,
ispan=ispan,
hispan=hispan,
query_run_start=qbegin,
matcher=matcher,
matcher_order=matcher_order,
query=query,
)
matches_append(match)
if TRACE and matches:
logger_debug(' ##exact_AHO: matches found#')
Expand Down Expand Up @@ -234,8 +251,16 @@ def match_fragments(idx, query_run):
qspan = Span(range(qpos, qpos + mlen))
ispan = Span(range(ipos, ipos + mlen))
hispan = Span(p for p in ispan if itokens[p] < len_legalese)
match = LicenseMatch(rule, qspan, ispan, hispan, qbegin,
matcher=MATCH_AHO_FRAG, query=query)
match = LicenseMatch(
rule=rule,
qspan=qspan,
ispan=ispan,
hispan=hispan,
query_run_start=qbegin,
matcher=MATCH_AHO_FRAG,
matcher_order=MATCH_AHO_FRAG_ORDER,
query=query,
)
frag_matches.append(match)

# Merge matches as usual
Expand Down
13 changes: 11 additions & 2 deletions src/licensedcode/match_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from array import array
from hashlib import sha1


from licensedcode.match import LicenseMatch
from licensedcode.spans import Span

Expand Down Expand Up @@ -39,6 +38,7 @@ def logger_debug(*args):
pass

MATCH_HASH = '1-hash'
MATCH_HASH_ORDER = 0


def tokens_hash(tokens):
Expand Down Expand Up @@ -73,6 +73,15 @@ def hash_match(idx, query_run, **kwargs):
qspan = Span(range(query_run.start, query_run.end + 1))
ispan = Span(range(0, rule.length))
hispan = Span(p for p in ispan if itokens[p] < len_legalese)
match = LicenseMatch(rule, qspan, ispan, hispan, query_run.start, matcher=MATCH_HASH, query=query_run.query)
match = LicenseMatch(
rule=rule,
qspan=qspan,
ispan=ispan,
hispan=hispan,
query_run_start=query_run.start,
matcher=MATCH_HASH,
matcher_order=MATCH_HASH_ORDER,
query=query_run.query,
)
matches.append(match)
return matches
30 changes: 21 additions & 9 deletions src/licensedcode/match_seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,9 @@
from time import time
import sys


from licensedcode.match import LicenseMatch
from licensedcode.spans import Span


TRACE = False
TRACE2 = False
TRACE3 = False
Expand All @@ -38,17 +36,24 @@ def logger_debug(*args): pass
def logger_debug(*args):
return prn(' '.join(isinstance(a, str) and a or repr(a) for a in args))


"""
Matching strategy using pair-wise multiple local sequences alignment and diff-
like approaches.
"""

MATCH_SEQ = '3-seq'


def match_sequence(idx, rule, query_run, high_postings, start_offset=0,
match_blocks=None, deadline=sys.maxsize):
MATCH_SEQ_ORDER = 3


def match_sequence(
idx,
rule,
query_run,
high_postings,
start_offset=0,
match_blocks=None,
deadline=sys.maxsize,
):
"""
Return a list of LicenseMatch by matching the `query_run` tokens sequence
starting at `start_offset` against the `idx` index for the candidate `rule`.
Expand Down Expand Up @@ -107,8 +112,15 @@ def match_sequence(idx, rule, query_run, high_postings, start_offset=0,
ispan = Span(range(ipos, ipos + mlen))
hispan = Span(p for p in ispan if itokens[p] < len_legalese)
match = LicenseMatch(
rule, qspan, ispan, hispan, qbegin,
matcher=MATCH_SEQ, query=query)
rule=rule,
qspan=qspan,
ispan=ispan,
hispan=hispan,
query_run_start=qbegin,
matcher=MATCH_SEQ,
matcher_order=MATCH_SEQ_ORDER,
query=query,
)
matches.append(match)

if TRACE2:
Expand Down
9 changes: 5 additions & 4 deletions src/licensedcode/match_spdx_lid.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def logger_debug(*args):
return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))

MATCH_SPDX_ID = '1-spdx-id'
MATCH_SPDX_ID_ORDER = 2


def spdx_id_match(idx, query_run, text, expression_symbols=None):
Expand Down Expand Up @@ -112,6 +113,7 @@ def spdx_id_match(idx, query_run, text, expression_symbols=None):
hispan=hispan,
query_run_start=match_start,
matcher=MATCH_SPDX_ID,
matcher_order=MATCH_SPDX_ID_ORDER,
query=query_run.query,
)
return match
Expand All @@ -136,7 +138,7 @@ def get_spdx_expression(text, expression_symbols=None):
expression_symbols = get_spdx_symbols()

unknown_symbol = get_unknown_spdx_symbol()
#_prefix, exp_text = prepare_text(text)
# _prefix, exp_text = prepare_text(text)

expression = get_expression(
text=text,
Expand Down Expand Up @@ -361,7 +363,7 @@ def clean_text(text):
if is_markup_text(text):
text = demarkup_text(text)

dangling_markup = ['</a>','</p>','</div>', '</licenseUrl>']
dangling_markup = ['</a>', '</p>', '</div>', '</licenseUrl>']
for markup in dangling_markup:
if markup in text:
text = text.replace(markup, '')
Expand All @@ -384,7 +386,7 @@ def clean_text(text):
if '">' in text:
text_fragments = text.split('">')
if text_fragments[1] in text_fragments[0]:
text = text_fragments[0]
text = text_fragments[0]

return ' '.join(text.split())

Expand All @@ -393,7 +395,6 @@ def clean_text(text):
'(spdx(?:\\-|\\s)+licen(?:s|c)e(?:\\-|\\s)+identifier\\s*:?\\s*)',
re.IGNORECASE).split


_nuget_split_spdx_lid = re.compile(
'(licenses(?:\\.|\\s)+nuget(?:\\.|\\s)+org\\s*:?\\s*)',
re.IGNORECASE).split
Expand Down
4 changes: 3 additions & 1 deletion src/licensedcode/match_unknown.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def logger_debug(*args):
pass

MATCH_UNKNOWN = '6-unknown'
MATCH_UNKNOWN_ORDER = 6

UNKNOWN_NGRAM_LENGTH = 6

Expand Down Expand Up @@ -176,7 +177,7 @@ def get_tokens(_toks):
match_len = len(qspan)

if TRACE:
#print('match_unknowns: matched_span:', get_tokens(matched_tokens))
# print('match_unknowns: matched_span:', get_tokens(matched_tokens))
print('match_unknowns: qspan, match_len, matched_span:', qspan, match_len, matched_tokens)

# we use the query side to build the ispans
Expand Down Expand Up @@ -227,6 +228,7 @@ def get_tokens(_toks):
hispan=hispan,
query_run_start=query_run.start,
matcher=MATCH_UNKNOWN,
matcher_order=MATCH_UNKNOWN_ORDER,
query=query,
)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* SPDX-License-Identifier: (GPL-2.0+ OR BSD)
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
license_expressions:
- gpl-2.0-plus OR bsd-new

0 comments on commit c581828

Please sign in to comment.