Skip to content

Commit

Permalink
unicode normalization (#18)
Browse files Browse the repository at this point in the history
* unicode normalization

* refactor

* fixes
  • Loading branch information
marcovzla authored Apr 13, 2020
1 parent c5828b9 commit 70fcb3a
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 1 deletion.
3 changes: 2 additions & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ libraryDependencies ++= Seq(
"com.typesafe" % "config" % "1.3.3",
"org.apache.commons" % "commons-lang3" % "3.9",
"org.apache.commons" % "commons-text" % "1.7",
"commons-io" % "commons-io" % "2.6"
"commons-io" % "commons-io" % "2.6",
"com.ibm.icu" % "icu4j" % "66.1",
)


Expand Down
67 changes: 67 additions & 0 deletions src/main/scala/ai/lum/common/StringUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import org.apache.commons.lang3.{ StringUtils => ApacheStringUtils }
import org.apache.commons.text.WordUtils
import org.apache.commons.text.StringEscapeUtils
import org.apache.commons.text.StringSubstitutor
import com.ibm.icu.text.Normalizer2

object StringUtils {

Expand Down Expand Up @@ -189,6 +190,72 @@ object StringUtils {
/** Pads this string on the right with `padStr` until it is `size` characters long.
  *
  * Delegates to Apache Commons Lang's `StringUtils.rightPad`.
  *
  * @param size   the length to pad to
  * @param padStr the string used as padding
  * @return the padded string as returned by Apache Commons Lang
  */
def rightPad(size: Int, padStr: String): String = {
  ApacheStringUtils.rightPad(str, size, padStr)
}

/** Normalizes this string to Unicode NFC, the form recommended by the W3C in
  * https://www.w3.org/TR/charmod-norm/
  *
  * Diacritics are kept and no character replacements are applied.
  */
def normalizeUnicode: String =
  normalizeUnicode(
    nfkcCasefold = false,
    removeDiacritics = false,
    preMapping = Map.empty[String, String],
    postMapping = Map.empty[String, String]
  )

/** Aggressive Unicode normalization.
  *
  * Removes diacritics, replaces the characters listed in
  * `LumAICommonStringWrapper.preMapping` with ASCII equivalents, normalizes with
  * NFKC_Casefold, and finally applies `LumAICommonStringWrapper.postMapping`.
  *
  * NOTE(review): no explicit whitespace-collapsing step is part of this pipeline;
  * confirm whether one is expected.
  */
def normalizeUnicodeAggressively: String =
  normalizeUnicode(
    nfkcCasefold = true,
    removeDiacritics = true,
    preMapping = LumAICommonStringWrapper.preMapping,
    postMapping = LumAICommonStringWrapper.postMapping
  )

/** Unicode normalization with configurable steps.
  *
  * The steps run in this order:
  *  1. if `removeDiacritics` is set, `stripAccents` is applied;
  *  1. every key of `preMapping` is replaced literally (not as a regex) by its value;
  *  1. the text is normalized with NFKC_Casefold when `nfkcCasefold` is set, NFC otherwise;
  *  1. every key of `postMapping` is replaced literally by its value.
  *
  * Replacements are applied one entry at a time in the map's iteration order, so
  * an entry whose value contains another entry's key may be rewritten again.
  *
  * @param nfkcCasefold     use the NFKC_Casefold normalizer instead of NFC
  * @param removeDiacritics strip accents before anything else
  * @param preMapping       literal replacements applied before normalization
  * @param postMapping      literal replacements applied after normalization
  * @return the normalized string
  */
def normalizeUnicode(
  nfkcCasefold: Boolean,
  removeDiacritics: Boolean,
  preMapping: Map[String, String],
  postMapping: Map[String, String]
): String = {
  // Applies each literal substitution of `mapping` to `text`.
  // String.replace is the non-deprecated equivalent of replaceAllLiterally.
  def substitute(text: String, mapping: Map[String, String]): String =
    mapping.foldLeft(text) { case (acc, (from, to)) => acc.replace(from, to) }

  val stripped = if (removeDiacritics) str.stripAccents else str
  val preMapped = substitute(stripped, preMapping)
  val normalizer =
    if (nfkcCasefold) Normalizer2.getNFKCCasefoldInstance()
    else Normalizer2.getNFCInstance()
  substitute(normalizer.normalize(preMapped), postMapping)
}

}

/** Default character replacement tables used by `normalizeUnicodeAggressively`. */
object LumAICommonStringWrapper {

  /** Literal replacements applied before Unicode normalization.
    * Each key is a single non-ASCII character; each value is its ASCII substitute.
    */
  val preMapping: Map[String, String] = Map[String, String](
    // double angle quotation marks
    "\u00ab" -> "<<",   // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    "\u00bb" -> ">>",   // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    // letters and ligatures
    "\u00c6" -> "AE",   // LATIN CAPITAL LETTER AE
    "\u00e6" -> "ae",   // LATIN SMALL LETTER AE
    "\u0152" -> "OE",   // LATIN CAPITAL LIGATURE OE
    "\u0153" -> "oe",   // LATIN SMALL LIGATURE OE
    "\u0192" -> "f",    // LATIN SMALL LETTER F WITH HOOK
    // spacing modifiers
    "\u02c6" -> "^",    // MODIFIER LETTER CIRCUMFLEX ACCENT
    "\u02dc" -> "~",    // SMALL TILDE
    // dashes
    "\u2013" -> "-",    // EN DASH
    "\u2014" -> "-",    // EM DASH
    // single and double quotation marks
    "\u2018" -> "'",    // LEFT SINGLE QUOTATION MARK
    "\u2019" -> "'",    // RIGHT SINGLE QUOTATION MARK
    "\u201a" -> "'",    // SINGLE LOW-9 QUOTATION MARK
    "\u201c" -> "\"",   // LEFT DOUBLE QUOTATION MARK
    "\u201d" -> "\"",   // RIGHT DOUBLE QUOTATION MARK
    "\u201e" -> "\"",   // DOUBLE LOW-9 QUOTATION MARK
    // daggers and bullet
    "\u2020" -> "*",    // DAGGER
    "\u2021" -> "**",   // DOUBLE DAGGER
    "\u2022" -> "-",    // BULLET
    // single angle quotation marks
    "\u2039" -> "<",    // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    "\u203a" -> ">",    // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    // symbols
    "\u2122" -> "(TM)", // TRADE MARK SIGN
  )

  /** Literal replacements applied after Unicode normalization. */
  val postMapping: Map[String, String] = Map[String, String](
    "\u2044" -> "/", // FRACTION SLASH
  )

}

}

0 comments on commit 70fcb3a

Please sign in to comment.