Skip to content

Commit

Permalink
normalize to "copyright" and "trademark" rather than single-char variants
Browse files Browse the repository at this point in the history

This improves matching correctness when trying to match a license file
to the text of a license which includes "copyright" verbatim in the
body of the license text, since the same normalization is not applied
to the original license texts. This includes common licenses such as
the 2- and 3-clause BSD licenses.

Signed-off-by: Tal Einat <[email protected]>
  • Loading branch information
taleinat committed May 4, 2020
1 parent 63bc934 commit 2ddf864
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
12 changes: 7 additions & 5 deletions licensedb/internal/normalize/normalize.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,13 @@ var (
)

// 9.1.1 "©", "(c)", or "Copyright" should be considered equivalent and interchangeable.
copyrightRe = regexp.MustCompile("copyright|\\(c\\)")
trademarkRe = regexp.MustCompile("trademark(s?)|\\(tm\\)")
copyrightRe = regexp.MustCompile("©|\\(c\\)")
trademarkRe = regexp.MustCompile("trademarks|\\(tm\\)|™")

// extra cleanup
brokenLinkRe = regexp.MustCompile("http s ://")
urlCleanupRe = regexp.MustCompile("[<(](http(s?)://[^\\s]+)[)>]")
copyrightLineRe = regexp.MustCompile("(?m)^((©.*)|(all rights reserved(\\.)?)|(li[cs]en[cs]e))\n")
copyrightLineRe = regexp.MustCompile("(?m)^((copyright.*)|(all rights reserved(\\.)?)|(li[cs]en[cs]e))\n")
nonAlphaNumRe = regexp.MustCompile("[^- \\na-z0-9]")

// used in Split()
Expand Down Expand Up @@ -128,8 +128,8 @@ func LicenseText(text string, strictness Strictness) string {
text = wordReplacer.Replace(text)

// 9. Copyright Symbol
text = copyrightRe.ReplaceAllString(text, "©")
text = trademarkRe.ReplaceAllString(text, "")
text = copyrightRe.ReplaceAllString(text, "copyright")
text = trademarkRe.ReplaceAllString(text, "trademark")

// fix broken URLs in SPDX source texts
text = brokenLinkRe.ReplaceAllString(text, "https://")
Expand All @@ -155,7 +155,9 @@ func LicenseText(text string, strictness Strictness) string {
// there are common mismatches because of trailing dots
text = strings.Replace(text, ".", "", -1)
// usually copyright lines are custom and occur multiple times
text = strings.Replace(text, "copyright notice", "PLACEHOLDER", -1)
text = copyrightLineRe.ReplaceAllString(text, "")
text = strings.Replace(text, "PLACEHOLDER", "copyright notice", -1)
}

if strictness > Moderate {
Expand Down
1 change: 1 addition & 0 deletions licensedb/internal/normalize/normalize_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ permissions granted by this license.`},
{"punctuation", "a-‒–—―⁓⸺⸻~˗‐‑⁃⁻₋−∼⎯⏤─➖𐆑֊﹘﹣-", "a-"},
{"bullet", "-\n*\n\n\n\n\n\n\n🞄\n\n\n", ""},
{"license", "", ""},
{"copyright notice", "copyright notice", "copyright notice"},
}

for _, tc := range tt {
Expand Down

0 comments on commit 2ddf864

Please sign in to comment.