Prefer exact license match over SPDX

Add a new matcher_order attribute to LicenseMatch and use it for sorting matches rather than the matcher string. This way we can ensure that there is a proper precedence between matchers when two matches are matching exactly the same text.

The new sort order for matchers is as follows:
- 0: 1-hash
- 1: 2-aho
- 2: 1-spdx-id
- 3: 3-seq
- 4: 5-undetected
- 5: 5-aho-frag
- 6: 6-unknown

The outcome is that a hash or aho match for the same text at the same position will take precedence over the SPDX id match, allowing us to curate and correct some incorrect license expressions if needed.

Reference: #3912
Reported-by: Ayan Sinha Mahapatra <[email protected]>
Signed-off-by: Philippe Ombredanne <[email protected]>
aboutcode-org · Sep 12, 2024 · c581828 · c581828
1 parent 9a7df2c
commit c581828
Show file tree

Hide file tree

Showing 9 changed files with 98 additions and 25 deletions.
diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py
@@ -70,6 +70,7 @@ def logger_debug(*args):
             return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))
 
 MATCHER_UNDETECTED = '5-undetected'
+MATCHER_UNDETECTED_ORDER = 4
 
 # All values of match_coverage less than this value then they are not considered
 # as perfect detections
@@ -1627,6 +1628,7 @@ def get_undetected_matches(query_string):
         hispan=hispan,
         query_run_start=match_start,
         matcher=MATCHER_UNDETECTED,
+        matcher_order=MATCHER_UNDETECTED_ORDER,
         query=query_run.query,
     )
 

diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py
@@ -42,8 +42,8 @@
 The filter functions are executed in a specific sequence over the list of matches.
 """
 
-TRACE = False
-TRACE_MERGE = False
+TRACE = True
+TRACE_MERGE = True
 TRACE_REFINE = False
 TRACE_FILTER_FALSE_POSITIVE = False
 TRACE_FILTER_CONTAINED = False
@@ -213,6 +213,15 @@ class LicenseMatch(object):
         )
     )
 
+    matcher_order = attr.ib(
+        default=0,
+        metadata=dict(
+            help='An integer indicating the precedence of a matcher when compared to other matchers '
+                 'where the lowest value has the highest precedence. Used to select which of two '
+                 'equal matches to keep.'
+        )
+    )
+
     start_line = attr.ib(
         default=0,
         metadata=dict(help='match start line, 1-based')
@@ -624,8 +633,10 @@ def combine(self, other):
 
         if other.matcher not in self.matcher:
             newmatcher = ' '.join([self.matcher, other.matcher])
+            newmatcher_order = max([self.matcher_order, other.matcher_order])
         else:
             newmatcher = self.matcher
+            newmatcher_order = self.matcher_order
 
         if (
             self.discard_reason == DiscardReason.NOT_DISCARDED
@@ -655,6 +666,7 @@ def combine(self, other):
             hispan=Span(self.hispan | other.hispan),
             query_run_start=min(self.query_run_start, other.query_run_start),
             matcher=newmatcher,
+            matcher_order=newmatcher_order,
             query=self.query,
             discard_reason=discard_reason,
         )
@@ -671,6 +683,7 @@ def update(self, other):
         self.matcher = combined.matcher
         self.query_run_start = min(self.query_run_start, other.query_run_start)
         self.matcher = combined.matcher
+        self.matcher_order = combined.matcher_order
         self.discard_reason = combined.discard_reason
         return self
 
@@ -852,7 +865,7 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):
 
     # only merge matches with the same rule: sort then group by rule for the
     # same rule, sort on start, longer high, longer match, matcher type
-    sorter = lambda m: (m.rule.identifier, m.qspan.start, -m.hilen(), -m.len(), m.matcher)
+    sorter = lambda m: (m.rule.identifier, m.qspan.start, -m.hilen(), -m.len(), m.matcher_order)
     matches.sort(key=sorter)
     matches_by_rule = [
         (rid, list(rule_matches))
@@ -1069,7 +1082,7 @@ def filter_contained_matches(
 
     # NOTE: we do not filter matches in place: sorted creates a copy
     # sort on start, longer high, longer match, matcher type
-    sorter = lambda m: (m.qspan.start, -m.hilen(), -m.len(), m.matcher)
+    sorter = lambda m: (m.qspan.start, -m.hilen(), -m.len(), m.matcher_order)
     matches = sorted(matches, key=sorter)
     matches_pop = matches.pop
 
@@ -1190,7 +1203,7 @@ def filter_overlapping_matches(
 
     # NOTE: we do not filter matches in place: sorted creates a copy
     # sort on start, longer high, longer match, matcher type
-    sorter = lambda m: (m.qspan.start, -m.hilen(), -m.len(), m.matcher)
+    sorter = lambda m: (m.qspan.start, -m.hilen(), -m.len(), m.matcher_order)
     matches = sorted(matches, key=sorter)
     matches_pop = matches.pop
 
@@ -2734,6 +2747,12 @@ def _log(_matches, _discarded, msg):
     matches, discarded_contained = filter_contained_matches(matches)
     _log(matches, discarded_contained, 'NON CONTAINED')
 
+    if trace_basic:
+        logger_debug(' #####refine_matches: after FILTER matches#', len(matches))
+    if trace:
+        for m in matches:
+            logger_debug(m)
+
     matches, discarded_overlapping = filter_overlapping_matches(matches)
     _log(matches, discarded_overlapping, 'NON OVERLAPPING')
 

diff --git a/src/licensedcode/match_aho.py b/src/licensedcode/match_aho.py
@@ -76,10 +76,19 @@ def add_sequence(automaton, tids, rid, start=0, with_duplicates=False):
 
 
 MATCH_AHO_EXACT = '2-aho'
+MATCH_AHO_EXACT_ORDER = 1  # ranks after 1-hash (0) and ahead of 1-spdx-id (2)
 MATCH_AHO_FRAG = '5-aho-frag'
+MATCH_AHO_FRAG_ORDER = 5
 
 
-def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):
+def exact_match(
+    idx,
+    query_run,
+    automaton,
+    matcher=MATCH_AHO_EXACT,
+    matcher_order=MATCH_AHO_EXACT_ORDER,
+    **kwargs,
+):
     """
     Return a list of exact LicenseMatch by matching the `query_run` against
     the `automaton` and `idx` index.
@@ -111,7 +120,15 @@ def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):
 
         rule = rules_by_rid[rid]
         match = LicenseMatch(
-            rule, qspan, ispan, hispan, qbegin, matcher=matcher, query=query)
+            rule=rule,
+            qspan=qspan,
+            ispan=ispan,
+            hispan=hispan,
+            query_run_start=qbegin,
+            matcher=matcher,
+            matcher_order=matcher_order,
+            query=query,
+        )
         matches_append(match)
     if TRACE and matches:
         logger_debug(' ##exact_AHO: matches found#')
@@ -234,8 +251,16 @@ def match_fragments(idx, query_run):
             qspan = Span(range(qpos, qpos + mlen))
             ispan = Span(range(ipos, ipos + mlen))
             hispan = Span(p for p in ispan if itokens[p] < len_legalese)
-            match = LicenseMatch(rule, qspan, ispan, hispan, qbegin,
-                matcher=MATCH_AHO_FRAG, query=query)
+            match = LicenseMatch(
+                rule=rule,
+                qspan=qspan,
+                ispan=ispan,
+                hispan=hispan,
+                query_run_start=qbegin,
+                matcher=MATCH_AHO_FRAG,
+                matcher_order=MATCH_AHO_FRAG_ORDER,
+                query=query,
+            )
             frag_matches.append(match)
 
     # Merge matches as usual

diff --git a/src/licensedcode/match_hash.py b/src/licensedcode/match_hash.py
@@ -10,7 +10,6 @@
 from array import array
 from hashlib import sha1
 
-
 from licensedcode.match import LicenseMatch
 from licensedcode.spans import Span
 
@@ -39,6 +38,7 @@ def logger_debug(*args):
         pass
 
 MATCH_HASH = '1-hash'
+MATCH_HASH_ORDER = 0
 
 
 def tokens_hash(tokens):
@@ -73,6 +73,15 @@ def hash_match(idx, query_run, **kwargs):
         qspan = Span(range(query_run.start, query_run.end + 1))
         ispan = Span(range(0, rule.length))
         hispan = Span(p for p in ispan if itokens[p] < len_legalese)
-        match = LicenseMatch(rule, qspan, ispan, hispan, query_run.start, matcher=MATCH_HASH, query=query_run.query)
+        match = LicenseMatch(
+            rule=rule,
+            qspan=qspan,
+            ispan=ispan,
+            hispan=hispan,
+            query_run_start=query_run.start,
+            matcher=MATCH_HASH,
+            matcher_order=MATCH_HASH_ORDER,
+            query=query_run.query,
+        )
         matches.append(match)
     return matches
diff --git a/src/licensedcode/match_seq.py b/src/licensedcode/match_seq.py
@@ -10,11 +10,9 @@
 from time import time
 import sys
 
-
 from licensedcode.match import LicenseMatch
 from licensedcode.spans import Span
 
-
 TRACE = False
 TRACE2 = False
 TRACE3 = False
@@ -38,17 +36,24 @@ def logger_debug(*args): pass
     def logger_debug(*args):
         return prn(' '.join(isinstance(a, str) and a or repr(a) for a in args))
 
-
 """
 Matching strategy using pair-wise multiple local sequences alignment and diff-
 like approaches.
 """
 
 MATCH_SEQ = '3-seq'
-
-
-def match_sequence(idx, rule, query_run, high_postings, start_offset=0,
-                   match_blocks=None, deadline=sys.maxsize):
+MATCH_SEQ_ORDER = 3
+
+
+def match_sequence(
+    idx,
+    rule,
+    query_run,
+    high_postings,
+    start_offset=0,
+    match_blocks=None,
+    deadline=sys.maxsize,
+):
     """
     Return a list of LicenseMatch by matching the `query_run` tokens sequence
     starting at `start_offset` against the `idx` index for the candidate `rule`.
@@ -107,8 +112,15 @@ def match_sequence(idx, rule, query_run, high_postings, start_offset=0,
                 ispan = Span(range(ipos, ipos + mlen))
                 hispan = Span(p for p in ispan if itokens[p] < len_legalese)
                 match = LicenseMatch(
-                    rule, qspan, ispan, hispan, qbegin,
-                    matcher=MATCH_SEQ, query=query)
+                    rule=rule,
+                    qspan=qspan,
+                    ispan=ispan,
+                    hispan=hispan,
+                    query_run_start=qbegin,
+                    matcher=MATCH_SEQ,
+                    matcher_order=MATCH_SEQ_ORDER,
+                    query=query,
+                )
                 matches.append(match)
 
                 if TRACE2:

diff --git a/src/licensedcode/match_spdx_lid.py b/src/licensedcode/match_spdx_lid.py
@@ -59,6 +59,7 @@ def logger_debug(*args):
         return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))
 
 MATCH_SPDX_ID = '1-spdx-id'
+MATCH_SPDX_ID_ORDER = 2
 
 
 def spdx_id_match(idx, query_run, text, expression_symbols=None):
@@ -112,6 +113,7 @@ def spdx_id_match(idx, query_run, text, expression_symbols=None):
         hispan=hispan,
         query_run_start=match_start,
         matcher=MATCH_SPDX_ID,
+        matcher_order=MATCH_SPDX_ID_ORDER,
         query=query_run.query,
     )
     return match
@@ -136,7 +138,7 @@ def get_spdx_expression(text, expression_symbols=None):
         expression_symbols = get_spdx_symbols()
 
     unknown_symbol = get_unknown_spdx_symbol()
-    #_prefix, exp_text = prepare_text(text)
+    # _prefix, exp_text = prepare_text(text)
 
     expression = get_expression(
         text=text,
@@ -361,7 +363,7 @@ def clean_text(text):
     if is_markup_text(text):
         text = demarkup_text(text)
 
-    dangling_markup = ['</a>','</p>','</div>', '</licenseUrl>']
+    dangling_markup = ['</a>', '</p>', '</div>', '</licenseUrl>']
     for markup in dangling_markup:
         if markup in text:
             text = text.replace(markup, '')
@@ -384,7 +386,7 @@ def clean_text(text):
     if '">' in text:
         text_fragments = text.split('">')
         if text_fragments[1] in text_fragments[0]:
-            text =  text_fragments[0]
+            text = text_fragments[0]
 
     return ' '.join(text.split())
 
@@ -393,7 +395,6 @@ def clean_text(text):
     '(spdx(?:\\-|\\s)+licen(?:s|c)e(?:\\-|\\s)+identifier\\s*:?\\s*)',
     re.IGNORECASE).split
 
-
 _nuget_split_spdx_lid = re.compile(
     '(licenses(?:\\.|\\s)+nuget(?:\\.|\\s)+org\\s*:?\\s*)',
     re.IGNORECASE).split

diff --git a/src/licensedcode/match_unknown.py b/src/licensedcode/match_unknown.py
@@ -44,6 +44,7 @@ def logger_debug(*args):
         pass
 
 MATCH_UNKNOWN = '6-unknown'
+MATCH_UNKNOWN_ORDER = 6
 
 UNKNOWN_NGRAM_LENGTH = 6
 
@@ -176,7 +177,7 @@ def get_tokens(_toks):
     match_len = len(qspan)
 
     if TRACE:
-        #print('match_unknowns: matched_span:', get_tokens(matched_tokens))
+        # print('match_unknowns: matched_span:', get_tokens(matched_tokens))
         print('match_unknowns: qspan, match_len, matched_span:', qspan, match_len, matched_tokens)
 
     # we use the query side to build the ispans
@@ -227,6 +228,7 @@ def get_tokens(_toks):
         hispan=hispan,
         query_run_start=query_run.start,
         matcher=MATCH_UNKNOWN,
+        matcher_order=MATCH_UNKNOWN_ORDER,
         query=query,
     )
 

diff --git a/tests/licensedcode/data/datadriven/lic4/bsd-bare-in-spdx.txt b/tests/licensedcode/data/datadriven/lic4/bsd-bare-in-spdx.txt
@@ -0,0 +1 @@
+ * SPDX-License-Identifier: (GPL-2.0+ OR BSD)
diff --git a/tests/licensedcode/data/datadriven/lic4/bsd-bare-in-spdx.txt.yml b/tests/licensedcode/data/datadriven/lic4/bsd-bare-in-spdx.txt.yml
@@ -0,0 +1,2 @@
+license_expressions:
+  - gpl-2.0-plus OR bsd-new