// StringDiff bundles two strings and exposes several similarity metrics
// over them.
//
// All metrics compare Unicode code points (runes), so a multi-byte
// character counts as a single character.
//
// See https://medium.com/@appaloosastore/string-similarity-algorithms-compared-3f7b4d12f0ff
// for a comparison of the algorithms implemented here.
type StringDiff struct {
	S1 string // first string to compare
	S2 string // second string to compare
}

// NewStringDiff returns a StringDiff that compares s1 with s2.
func NewStringDiff(s1, s2 string) *StringDiff {
	return &StringDiff{S1: s1, S2: s2}
}

// LevenshteinDistance returns the minimum number of single-character
// insertions, deletions and substitutions needed to turn s1 into s2.
//
// The result is 0 for equal strings and grows with the length of the
// inputs, which makes raw distances hard to compare across string pairs.
//
// See https://en.wikipedia.org/wiki/Levenshtein_distance
func LevenshteinDistance(s1, s2 string) int {
	return NewStringDiff(s1, s2).LevenshteinDistance()
}

// LevenshteinDistance returns the minimum number of single-character
// insertions, deletions and substitutions needed to turn sd.S1 into sd.S2.
//
// The result is 0 for equal strings and grows with the length of the
// inputs, which makes raw distances hard to compare across string pairs.
//
// See https://en.wikipedia.org/wiki/Levenshtein_distance
func (sd *StringDiff) LevenshteinDistance() int {
	s := []rune(sd.S1)
	t := []rune(sd.S2)
	if len(s) == 0 {
		return len(t)
	}
	if len(t) == 0 {
		return len(s)
	}
	// The distance is symmetric; keeping the shorter string on the row axis
	// bounds the working memory by the shorter input.
	if len(s) < len(t) {
		s, t = t, s
	}

	// Only the previous and the current row of the classic DP matrix are
	// needed. Cells are ints so long inputs cannot wrap around.
	prev := make([]int, len(t)+1)
	curr := make([]int, len(t)+1)
	for j := range prev {
		prev[j] = j // empty prefix of s -> first j runes of t: j insertions
	}
	for i, sc := range s {
		curr[0] = i + 1 // first i+1 runes of s -> empty prefix of t: deletions
		for j, tc := range t {
			cost := 1
			if sc == tc {
				cost = 0
			}
			best := prev[j] + cost // substitution (or match)
			if d := prev[j+1] + 1; d < best {
				best = d // deletion
			}
			if d := curr[j] + 1; d < best {
				best = d // insertion
			}
			curr[j+1] = best
		}
		prev, curr = curr, prev
	}
	return prev[len(t)]
}

// trigram is a window of three consecutive runes. Being an array, it is
// comparable and can be used directly as a map key.
type trigram [3]rune

// trigramSet returns the set of distinct trigrams of s after padding it with
// one space on each side. An empty string has no trigrams.
func trigramSet(s string) map[trigram]struct{} {
	runes := []rune(s)
	if len(runes) == 0 {
		return nil
	}
	padded := make([]rune, 0, len(runes)+2)
	padded = append(padded, ' ')
	padded = append(padded, runes...)
	padded = append(padded, ' ')

	set := make(map[trigram]struct{}, len(padded)-2)
	for i := 0; i+3 <= len(padded); i++ {
		set[trigram{padded[i], padded[i+1], padded[i+2]}] = struct{}{}
	}
	return set
}

// TrigramCompare returns the trigram similarity of s1 and s2: the number of
// distinct three-character sequences the strings share divided by the number
// of distinct three-character sequences appearing in either of them.
//
// The result lies in [0, 1]; 1 means both strings have the same trigram set.
//
// See https://en.wikipedia.org/wiki/N-gram
func TrigramCompare(s1, s2 string) float32 {
	return NewStringDiff(s1, s2).TrigramCompare()
}

// TrigramCompare returns the trigram similarity of sd.S1 and sd.S2: the
// number of distinct three-character sequences the strings share divided by
// the number of distinct three-character sequences appearing in either of
// them.
//
// The result lies in [0, 1]; 1 means both strings have the same trigram set.
// Two empty strings are considered identical and yield 1.
//
// See https://github.com/milk1000cc/trigram/blob/master/lib/trigram.rb,
// http://search.cpan.org/dist/String-Trigram/Trigram.pm and
// https://en.wikipedia.org/wiki/N-gram
func (sd *StringDiff) TrigramCompare() float32 {
	a := trigramSet(sd.S1)
	b := trigramSet(sd.S2)
	if len(a) == 0 && len(b) == 0 {
		return 1 // avoids 0/0; nothing distinguishes two empty strings
	}
	shared := 0
	for g := range a {
		if _, ok := b[g]; ok {
			shared++
		}
	}
	union := len(a) + len(b) - shared
	return float32(float64(shared) / float64(union))
}

// jaroSimilarity computes the Jaro similarity of a and b.
//
// Two runes match when they are equal and no further apart than
// max(len(a), len(b))/2 - 1 positions; every rune takes part in at most one
// match. The score is (m/len(a) + m/len(b) + (m-t)/m) / 3, where m is the
// number of matches and t is half the number of matched runes that appear in
// a different order in the two strings.
func jaroSimilarity(a, b []rune) float32 {
	if len(a) == 0 && len(b) == 0 {
		return 1
	}
	if len(a) == 0 || len(b) == 0 {
		return 0
	}

	longest := len(a)
	if len(b) > longest {
		longest = len(b)
	}
	window := longest/2 - 1
	if window < 0 {
		window = 0
	}

	aMatched := make([]bool, len(a))
	bMatched := make([]bool, len(b))
	matches := 0
	for i := range a {
		lo := i - window
		if lo < 0 {
			lo = 0
		}
		hi := i + window + 1
		if hi > len(b) {
			hi = len(b)
		}
		for j := lo; j < hi; j++ {
			if bMatched[j] || a[i] != b[j] {
				continue
			}
			aMatched[i], bMatched[j] = true, true
			matches++
			break
		}
	}
	if matches == 0 {
		return 0 // also guards the division by m below
	}

	// Walk both match sequences in order and count positions that disagree.
	outOfOrder := 0
	j := 0
	for i := range a {
		if !aMatched[i] {
			continue
		}
		for !bMatched[j] {
			j++
		}
		if a[i] != b[j] {
			outOfOrder++
		}
		j++
	}

	m := float32(matches)
	tt := float32(outOfOrder / 2)
	return (1.0 / 3.0) * ((m / float32(len(a))) + (m / float32(len(b))) + ((m - tt) / m))
}

// JaroDistance returns the Jaro similarity of s1 and s2, a score in [0, 1]
// based on the number of matching characters and the number of
// transpositions among them. 1 means the strings are equal, 0 means they
// have no matching characters.
//
// See https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
func JaroDistance(s1, s2 string) float32 {
	return NewStringDiff(s1, s2).JaroDistance()
}

// JaroDistance returns the Jaro similarity of sd.S1 and sd.S2, a score in
// [0, 1] based on the number of matching characters and the number of
// transpositions among them. 1 means the strings are equal, 0 means they
// have no matching characters. Two empty strings yield 1; an empty string
// compared with a non-empty one yields 0.
//
// See https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
func (sd *StringDiff) JaroDistance() float32 {
	return jaroSimilarity([]rune(sd.S1), []rune(sd.S2))
}

// JaroWinklerDistance returns the Jaro-Winkler similarity of s1 and s2: the
// Jaro similarity boosted for strings that share a common prefix.
//
// p is the constant scaling factor that controls how much the score is
// adjusted upwards for a common prefix. The standard value in Winkler's work
// is 0.1; it should not exceed 0.25, otherwise the result can exceed 1.
func JaroWinklerDistance(s1, s2 string, p float32) float32 {
	return NewStringDiff(s1, s2).JaroWinklerDistance(p)
}

// JaroWinklerDistance returns the Jaro-Winkler similarity of sd.S1 and
// sd.S2: the Jaro similarity boosted for strings that share a common prefix.
// At most the first four characters of the common prefix are rewarded.
//
// p is the constant scaling factor that controls how much the score is
// adjusted upwards for a common prefix. The standard value in Winkler's work
// is 0.1; it should not exceed 0.25, otherwise the result can exceed 1.
//
// See https://github.com/flori/amatch,
// https://fr.wikipedia.org/wiki/Distance_de_Jaro-Winkler and
// https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
func (sd *StringDiff) JaroWinklerDistance(p float32) float32 {
	const maxPrefix = 4

	a := []rune(sd.S1)
	b := []rune(sd.S2)
	dj := jaroSimilarity(a, b)

	limit := len(a)
	if len(b) < limit {
		limit = len(b)
	}
	if limit > maxPrefix {
		limit = maxPrefix
	}
	prefix := 0
	for prefix < limit && a[prefix] == b[prefix] {
		prefix++
	}

	return dj + ((p * float32(prefix)) * (1.0 - dj))
}
1, + }) + for _, td := range testData { + sd := NewStringDiff(td.S1, td.S2) + if sd.TrigramCompare() != td.D { + t.Error("trigram Compare between", td.S1, "and", td.S2, "expected to", td.D, "but", sd.TrigramCompare()) + } + } +} + +type TestJaroDistancce struct { + S1 string + S2 string + DJ float32 +} + +func TestJaroDistance(t *testing.T) { + testData := make([]*TestJaroDistancce, 0) + testData = append(testData, &TestJaroDistancce{ + S1: "martha", + S2: "marhta", + DJ: 0.9444444, + }, &TestJaroDistancce{ + S1: "martha", + S2: "martha", + DJ: 1, + }) + for _, td := range testData { + sd := NewStringDiff(td.S1, td.S2) + if sd.JaroDistance() != td.DJ { + t.Error("Jaro Distance between", td.S1, "and", td.S2, "expected to", td.DJ, "but", sd.JaroDistance()) + } + } +} + +func TestJaroWinklerDistance(t *testing.T) { + testData := make([]*TestJaroDistancce, 0) + testData = append(testData, &TestJaroDistancce{ + S1: "martha", + S2: "marhta", + DJ: 0.96111107, + }, &TestJaroDistancce{ + S1: "martha", + S2: "martha", + DJ: 1, + }) + for _, td := range testData { + sd := NewStringDiff(td.S1, td.S2) + if sd.JaroWinklerDistance(0.1) != td.DJ { + t.Error("Jaro Distance between", td.S1, "and", td.S2, "expected to", td.DJ, "but", sd.JaroWinklerDistance(0.1)) + } + } +} diff --git a/CODE_OF_CONDUCTS.md b/CODE_OF_CONDUCTS.md new file mode 100644 index 0000000..d7c30db --- /dev/null +++ b/CODE_OF_CONDUCTS.md @@ -0,0 +1,74 @@ +## Code of Conduct + +### Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, gender identity and expression, level of experience, +nationality, personal appearance, race, religion, or sexual identity and +orientation. 
+ +### Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +### Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +### Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. 
+ +### Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at `oss@hyperjump.tech`. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +### Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..9eaff05 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,32 @@ +# Contributing + +When contributing to this repository, please first discuss the change you wish to make via issue, +email, or any other method with the owners of this repository before making a change. + +Please note that we have a code of conduct; please follow it in all your interactions with the project. + +## Fork Process + +1. Ensure that Go (minimum 1.13) is installed on your system. +2. Fork this project into your own GitHub account. +3. Clone the forked `beda` repository from your account. +4. Enter the cloned directory. +5. Add the original `hyperjumptech/beda` repository as the "upstream" remote. +6. Now you can work on your fork. +7. Remember to pull from your upstream often. `git pull upstream master` + +## Pull Request Process + +1. Make sure you always have the most recent update from your upstream. `git pull upstream master` +2. Resolve all conflicts, if any. +3. 
Make sure `make test` always successful (you wont be able to create pull request if this fail, circle-ci, travis-ci and azure-devops will make sure of this.) +4. Push your code to your project's master repository. +5. Create PullRequest. + * Go to `github.com/hyperjumptech/beda` + * Select `Pull Request` tab + * Click "New pull request" button + * Click "compare across fork" + * Change the source head repository from your fork and target is `hyperjumptech/beda` + * Hit the "Create pull request" button + * Fill in all necessary information to help us understand about your pull request. + diff --git a/LICENSE-2.0.txt b/LICENSE-2.0.txt new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE-2.0.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..99ce77d --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,13 @@ +Copyright 2019 hyperjump.tech + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7afa4b4 --- /dev/null +++ b/Makefile @@ -0,0 +1,14 @@ +GO111MODULE=on + +.PHONY: all test clean build docker + +build: + export GO111MODULE on; \ + go build ./... + +test: build + go test ./... -v -covermode=count -coverprofile=coverage.out + golint -set_exit_status . 
+ +test-coverage: test + go tool cover -html=coverage.out \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..cd73392 --- /dev/null +++ b/README.md @@ -0,0 +1,210 @@ +# BEDA + +[![Build Status](https://travis-ci.org/hyperjumptech/beda.svg?branch=master)](https://travis-ci.org/hyperjumptech/beda) +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +## Get BEDA + +``` +go get github.com/hyperjumptech/beda +``` + +## Introduction + +**BEDA** is a Go library to detect differences or similarities between two words or strings. +Sometimes you want to detect whether a string is "the same as" or "somehow similar to" another string. +Suppose your system wants to detect whenever a user puts a bad word in their user name, or +to forbid users from using unwanted words in their postings. You need to implement some *not so easy* +algorithm to do this task. + +**BEDA** contains implementations of algorithms for detecting word differences. They are: + +1. Levenshtein Distance : A string metric for measuring the difference between two sequences. [Wikipedia](https://en.wikipedia.org/wiki/Levenshtein_distance) +2. Trigram or n-gram : A contiguous sequence of n items from a given sample of text or speech. [Wikipedia](https://en.wikipedia.org/wiki/N-gram) +3. Jaro & Jaro Winkler Distance : A string metric measuring an edit distance between two sequences. [Wikipedia](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) + +**BEDA** is an Indonesian word for "different". 
+ +## Usage + +```go +import "github.com/hyperjumptech/beda" + +sd := beda.NewStringDiff("The First String", "The Second String") +lDist := sd.LevenshteinDistance() +tDiff := sd.TrigramCompare() +jDiff := sd.JaroDistance() +jwDiff := sd.JaroWinklerDistance(0.1) + +fmt.Printf("Levenshtein Distance is %d \n", lDist) +fmt.Printf("Trigram Compare is %f \n", tDiff) +fmt.Printf("Jaro Distance is %f \n", jDiff) +fmt.Printf("Jaro Winkler Distance is %f \n", jwDiff) +``` + +## Algorithms and APIs + +String comparison is not so easy. +There are a couple of algorithms to do this comparison, and each of them yields a different result. +Thus each may be better suited to one purpose than to another. + +To understand how, when, and which algorithm will benefit your string comparison quest, +please read [String similarity algorithms compared](https://medium.com/@appaloosastore/string-similarity-algorithms-compared-3f7b4d12f0ff). +Read it through; it will help you a lot. + +```go +type StringDiff struct { + S1 string + S2 string +} +``` + +### Levenshtein Distance + +LevenshteinDistance is the minimum number of single-character edits +required to change one word into the other, so the result is a non-negative +integer. The algorithm is sensitive to string length, which makes it more difficult to draw patterns. + +Reading : + +- [https://github.com/mhutter/string-similarity](https://github.com/mhutter/string-similarity) +- [https://en.wikipedia.org/wiki/Levenshtein_distance](https://en.wikipedia.org/wiki/Levenshtein_distance) + +API : + +```go +func LevenshteinDistance(s1, s2 string) int +func (sd *StringDiff) LevenshteinDistance() int +``` + +`s1` is the first string to compare
+`s2` is the second string to compare
+The closer return value to 0 means the more similar the two words. + +Example : + +```go +sd := beda.NewStringDiff("abcd", "bc") +lDist := sd.LevenshteinDistance() +fmt.Printf("Distance is %d \n", lDist) // prints : Distance is 2 +``` + +or + +```go +fmt.Printf("Distance is %d \n", beda.LevenshteinDistance("abcd", "bc")) +``` + +### TriGram Compare + +TrigramCompare is a case of n-gram, a contiguous sequence of n (three, in this case) items from a given sample. +In our case, an application name is a sample and a character is an item. + +Reading: + +- [https://github.com/milk1000cc/trigram/blob/master/lib/trigram.rb](https://github.com/milk1000cc/trigram/blob/master/lib/trigram.rb) +- [http://search.cpan.org/dist/String-Trigram/Trigram.pm](http://search.cpan.org/dist/String-Trigram/Trigram.pm) +- [https://en.wikipedia.org/wiki/N-gram](https://en.wikipedia.org/wiki/N-gram) + +API : + +```go +func TrigramCompare(s1, s2 string) float32 +func (sd *StringDiff) TrigramCompare() float32 +``` + +`s1` is the first string to compare
+`s2` is the second string to compare
+The closer the result to 1 (one) means that the word is closer 100% similarities in 3 grams sequence. + +Example : + +```go +sd := beda.NewStringDiff("martha", "marhta") +diff := sd.TrigramCompare() +fmt.Printf("Differences is %f \n", diff) +``` + +or + +```go +fmt.Printf("Distance is %f \n", beda.TrigramCompare("martha", "marhta")) +``` + +### Jaro Distance + +JaroDistance distance between two words is the minimum number +of single-character transpositions required to change one word +into the other. + +API : + +```go +func JaroDistance(s1, s2 string) float32 +func (sd *StringDiff) JaroDistance() float32 +``` + +`s1` is the first string to compare
+`s2` is the second string to compare
+The closer the result is to 1 (one), the closer the two words are to being 100% similar. + +Example : + +```go +sd := beda.NewStringDiff("martha", "marhta") +diff := sd.JaroDistance() +fmt.Printf("Differences is %f \n", diff) +``` + +or + +```go +fmt.Printf("Distance is %f \n", beda.JaroDistance("martha", "marhta")) +``` + +### Jaro Winkler Distance + +JaroWinklerDistance uses a prefix scale which gives more +favourable ratings to strings that match from the beginning +for a set prefix length. + +Reading : + +- [https://github.com/flori/amatch](https://github.com/flori/amatch) +- [https://fr.wikipedia.org/wiki/Distance_de_Jaro-Winkler](https://fr.wikipedia.org/wiki/Distance_de_Jaro-Winkler) +- [https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) + +API : + +```go +func JaroWinklerDistance(s1, s2 string, p float32) float32 +func (sd *StringDiff) JaroWinklerDistance(p float32) float32 +``` + +or + +```go +fmt.Printf("Distance is %f \n", beda.JaroWinklerDistance("martha", "marhta", 0.1)) +``` + +`s1` is the first string to compare
+`s2` is the second string to compare
+The `p` argument is a constant scaling factor for how much the score is adjusted upwards for having common prefixes. +The standard value for this constant in Winkler’s work is `p = 0.1`. + +The closer the result is to 1 (one), the closer the two words are to being 100% similar. + +Example : + +```go +sd := beda.NewStringDiff("martha", "marhta") +diff := sd.JaroWinklerDistance(0.1) +fmt.Printf("Differences is %f \n", diff) +``` + +# Tasks and Help Wanted. + +Yes. We need contributors to make **BEDA** even better and more useful to the Open Source community. + +If you really want to help us, simply `Fork` the project and open a Pull Request. +Please read our [Contribution Manual](CONTRIBUTING.md) and [Code of Conduct](CODE_OF_CONDUCTS.md) \ No newline at end of file diff --git a/coverage.out b/coverage.out new file mode 100644 index 0000000..e75cdac --- /dev/null +++ b/coverage.out @@ -0,0 +1,72 @@ +mode: count +github.com/hyperjumptech/beda/Beda.go:4.47,9.2 1 14 +github.com/hyperjumptech/beda/Beda.go:26.45,29.2 2 0 +github.com/hyperjumptech/beda/Beda.go:38.49,47.19 6 8 +github.com/hyperjumptech/beda/Beda.go:52.2,52.26 1 8 +github.com/hyperjumptech/beda/Beda.go:57.2,57.26 1 8 +github.com/hyperjumptech/beda/Beda.go:61.2,61.25 1 8 +github.com/hyperjumptech/beda/Beda.go:74.2,74.21 1 8 +github.com/hyperjumptech/beda/Beda.go:47.19,49.3 1 32 +github.com/hyperjumptech/beda/Beda.go:52.26,54.3 1 24 +github.com/hyperjumptech/beda/Beda.go:57.26,59.3 1 24 +github.com/hyperjumptech/beda/Beda.go:61.25,62.26 1 24 +github.com/hyperjumptech/beda/Beda.go:62.26,64.20 2 72 +github.com/hyperjumptech/beda/Beda.go:69.4,71.36 1 72 +github.com/hyperjumptech/beda/Beda.go:64.20,66.5 1 18 +github.com/hyperjumptech/beda/Beda.go:66.10,68.5 1 54 +github.com/hyperjumptech/beda/Beda.go:84.46,85.28 1 40 +github.com/hyperjumptech/beda/Beda.go:90.2,90.30 1 22 +github.com/hyperjumptech/beda/Beda.go:85.28,86.18 1 202 +github.com/hyperjumptech/beda/Beda.go:86.18,88.4 1 18 +github.com/hyperjumptech/beda/Beda.go:93.46,94.35 1 
402 +github.com/hyperjumptech/beda/Beda.go:97.2,97.27 1 402 +github.com/hyperjumptech/beda/Beda.go:102.2,102.13 1 36 +github.com/hyperjumptech/beda/Beda.go:94.35,96.3 1 0 +github.com/hyperjumptech/beda/Beda.go:97.27,98.24 1 496 +github.com/hyperjumptech/beda/Beda.go:98.24,100.4 1 366 +github.com/hyperjumptech/beda/Beda.go:105.40,107.17 2 4 +github.com/hyperjumptech/beda/Beda.go:110.2,115.33 5 4 +github.com/hyperjumptech/beda/Beda.go:119.2,119.12 1 4 +github.com/hyperjumptech/beda/Beda.go:107.17,109.3 1 0 +github.com/hyperjumptech/beda/Beda.go:115.33,118.3 2 40 +github.com/hyperjumptech/beda/Beda.go:126.44,129.2 2 0 +github.com/hyperjumptech/beda/Beda.go:139.48,146.25 7 2 +github.com/hyperjumptech/beda/Beda.go:154.2,155.25 2 2 +github.com/hyperjumptech/beda/Beda.go:158.2,158.25 1 2 +github.com/hyperjumptech/beda/Beda.go:161.2,163.35 2 2 +github.com/hyperjumptech/beda/Beda.go:146.25,147.26 1 20 +github.com/hyperjumptech/beda/Beda.go:147.26,148.19 1 200 +github.com/hyperjumptech/beda/Beda.go:148.19,151.5 1 18 +github.com/hyperjumptech/beda/Beda.go:155.25,157.3 1 20 +github.com/hyperjumptech/beda/Beda.go:158.25,160.3 1 20 +github.com/hyperjumptech/beda/Beda.go:166.31,168.25 2 72 +github.com/hyperjumptech/beda/Beda.go:173.2,173.12 1 72 +github.com/hyperjumptech/beda/Beda.go:168.25,169.24 1 216 +github.com/hyperjumptech/beda/Beda.go:169.24,171.4 1 123 +github.com/hyperjumptech/beda/Beda.go:176.35,179.21 3 4 +github.com/hyperjumptech/beda/Beda.go:186.2,187.23 2 4 +github.com/hyperjumptech/beda/Beda.go:192.2,192.12 1 4 +github.com/hyperjumptech/beda/Beda.go:179.21,182.3 2 0 +github.com/hyperjumptech/beda/Beda.go:182.8,185.3 2 4 +github.com/hyperjumptech/beda/Beda.go:187.23,188.17 1 24 +github.com/hyperjumptech/beda/Beda.go:188.17,190.4 1 4 +github.com/hyperjumptech/beda/Beda.go:195.32,197.21 2 4 +github.com/hyperjumptech/beda/Beda.go:204.2,205.23 2 4 +github.com/hyperjumptech/beda/Beda.go:213.2,213.12 1 4 +github.com/hyperjumptech/beda/Beda.go:197.21,200.3 2 0 
+github.com/hyperjumptech/beda/Beda.go:200.8,203.3 2 4 +github.com/hyperjumptech/beda/Beda.go:205.23,206.24 1 24 +github.com/hyperjumptech/beda/Beda.go:206.24,207.16 1 68 +github.com/hyperjumptech/beda/Beda.go:207.16,209.10 2 24 +github.com/hyperjumptech/beda/Beda.go:219.42,222.2 2 0 +github.com/hyperjumptech/beda/Beda.go:227.46,238.2 8 4 +github.com/hyperjumptech/beda/Beda.go:247.60,250.2 2 0 +github.com/hyperjumptech/beda/Beda.go:263.62,269.21 6 2 +github.com/hyperjumptech/beda/Beda.go:276.2,276.22 1 2 +github.com/hyperjumptech/beda/Beda.go:287.2,289.11 2 2 +github.com/hyperjumptech/beda/Beda.go:269.21,272.3 2 0 +github.com/hyperjumptech/beda/Beda.go:272.8,275.3 2 2 +github.com/hyperjumptech/beda/Beda.go:276.22,277.16 1 9 +github.com/hyperjumptech/beda/Beda.go:277.16,279.15 2 8 +github.com/hyperjumptech/beda/Beda.go:279.15,280.10 1 1 +github.com/hyperjumptech/beda/Beda.go:282.9,283.9 1 1 diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..be0da7a --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/hyperjumptech/beda + +go 1.13