Skip to content

Commit 8ab6b8f

Browse files
committed
Added string comparison functions
1 parent 8c584d9 commit 8ab6b8f

4 files changed

+509
-0
lines changed
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
2+
alter FUNCTION LevenschteinDifference
3+
(
4+
@FirstString nVarchar(255), @SecondString nVarchar(255)
5+
)
6+
RETURNS int
7+
as begin
8+
Declare @PseudoMatrix table
9+
(location int identity primary key,
10+
firstorder int not null,
11+
Firstch nchar(1),
12+
secondorder int not null,
13+
Secondch nchar(1),
14+
Thevalue int not null default 0,
15+
PreviousRowValues varchar(200)
16+
)
17+
18+
insert into @PseudoMatrix (firstorder, firstch, secondorder, secondch, TheValue )
19+
SELECT TheFirst.number,TheFirst.ch, TheSecond.number,TheSecond.ch,0
20+
FROM --divide up the first string into a table of characters/sequence
21+
(SELECT number, SUBSTRING(@FirstString,number,1) AS ch
22+
FROM numbers WHERE number <= LEN(@FirstString) union all Select 0,Char(0)) TheFirst
23+
cross JOIN --divide up the second string into a table of characters/sequence
24+
(SELECT number, SUBSTRING(@SecondString,number,1) AS ch
25+
FROM numbers WHERE number <= LEN(@SecondString) union all Select 0,Char(0)) TheSecond
26+
--ON Thefirst.ch= Thesecond.ch --do all valid matches
27+
order by TheFirst.number, TheSecond.number
28+
29+
Declare @current Varchar(255)
30+
Declare @previous Varchar(255)
31+
Declare @TheValue int
32+
Declare @Deletion int, @Insertion int, @Substitution int, @minim int
33+
Select @current='', @previous=''
34+
Update @PseudoMatrix
35+
Set
36+
@Deletion=@TheValue+1,
37+
@Insertion=ascii(substring(@previous,secondorder+1,1))+1,
38+
@Substitution=ascii(substring(@previous,(secondorder),1)) +1,
39+
@minim=case when @Deletion<@Insertion then @Deletion else @insertion end,
40+
@TheValue = Thevalue = case --when Firstorder+SecondOrder=0 then 0
41+
when SecondOrder=0 then FirstOrder
42+
When FirstOrder=0 then Secondorder
43+
when FirstCh=SecondCh then ascii(substring(@previous,(secondorder),1))
44+
else case when @Minim<@Substitution then @Minim else @Substitution end
45+
end,
46+
@Previous=PreviousRowValues=case when secondorder =0 then @current else @Previous end,
47+
@current= case when secondorder =0 then char(@TheValue) else @Current+char(@TheValue) end
48+
return @TheValue
49+
End
50+
Go
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
alter function LEVENSHTEIN ( @SourceString nvarchar(100), @TargetString nvarchar(100) )
2+
--Returns the Levenshtein Distance between @SourceString string and @TargetString
3+
--Translated to TSQL by Joseph Gama
4+
--Updated slightly by Phil Factor
5+
returns int
6+
as
7+
BEGIN
8+
DECLARE @Matrix Nvarchar(4000), @LD int, @TargetStringLength int, @SourceStringLength int,
9+
@ii int, @jj int, @CurrentSourceChar nchar(1), @CurrentTargetChar nchar(1),@Cost int,
10+
@Above int,@AboveAndToLeft int,@ToTheLeft int, @MinimumValueOfCells int
11+
-- Step 1: Set n to be the length of s. Set m to be the length of t.
12+
-- If n = 0, return m and exit.
13+
-- If m = 0, return n and exit.
14+
-- Construct a matrix containing 0..m rows and 0..n columns.
15+
if @SourceString is null or @TargetString is null return null
16+
Select @SourceStringLength=LEN(@SourceString),
17+
@TargetStringLength=LEN(@TargetString),
18+
@Matrix=replicate(nchar(0),(@SourceStringLength+1)*(@TargetStringLength+1))
19+
If @SourceStringLength = 0 return @TargetStringLength
20+
If @TargetStringLength = 0 return @SourceStringLength
21+
if (@TargetStringLength+1)*(@SourceStringLength+1)> 4000 return -1
22+
--Step 2: Initialize the first row to 0..n.
23+
-- Initialize the first column to 0..m.
24+
SET @ii=0
25+
WHILE @ii<=@SourceStringLength
26+
BEGIN
27+
SET @Matrix=STUFF(@Matrix,@ii+1,1,nchar(@ii))--d(i, 0) = i
28+
SET @ii=@ii+1
29+
END
30+
SET @ii=0
31+
WHILE @ii<=@TargetStringLength
32+
BEGIN
33+
SET @Matrix=STUFF(@Matrix,@ii*(@SourceStringLength+1)+1,1,nchar(@ii))--d(0, j) = j
34+
SET @ii=@ii+1
35+
END
36+
--Step 3 Examine each character of s (i from 1 to n).
37+
SET @ii=1
38+
WHILE @ii<=@SourceStringLength
39+
BEGIN
40+
41+
--Step 4 Examine each character of t (j from 1 to m).
42+
SET @jj=1
43+
WHILE @jj<=@TargetStringLength
44+
BEGIN
45+
--Step 5 and 6
46+
Select
47+
--Set cell d[i,j] of the matrix equal to the minimum of:
48+
--a. The cell immediately above plus 1: d[i-1,j] + 1.
49+
--b. The cell immediately to the left plus 1: d[i,j-1] + 1.
50+
--c. The cell diagonally above and to the left plus the cost: d[i-1,j-1] + cost
51+
@Above=unicode(substring(@Matrix,@jj*(@SourceStringLength+1)+@ii-1+1,1))+1,
52+
@ToTheLeft=unicode(substring(@Matrix,(@jj-1)*(@SourceStringLength+1)+@ii+1,1))+1,
53+
@AboveAndToLeft=unicode(substring(@Matrix,(@jj-1)*(@SourceStringLength+1)+@ii-1+1,1))
54+
+ case when (substring(@SourceString,@ii,1)) = (substring(@TargetString,@jj,1))
55+
then 0 else 1 end--the cost
56+
-- If s[i] equals t[j], the cost is 0.
57+
-- If s[i] doesn't equal t[j], the cost is 1.
58+
-- now calculate the minimum value of the three
59+
if (@Above < @ToTheLeft) AND (@Above < @AboveAndToLeft)
60+
select @MinimumValueOfCells=@Above
61+
else if (@ToTheLeft < @Above) AND (@ToTheLeft < @AboveAndToLeft)
62+
select @MinimumValueOfCells=@ToTheLeft
63+
else
64+
select @MinimumValueOfCells=@AboveAndToLeft
65+
Select @Matrix=STUFF(@Matrix,
66+
@jj*(@SourceStringLength+1)+@ii+1,1,
67+
nchar(@MinimumValueOfCells)),
68+
@jj=@jj+1
69+
END
70+
SET @ii=@ii+1
71+
END
72+
--Step 7 After iteration steps (3, 4, 5, 6) are complete, distance is found in cell d[n,m]
73+
return unicode(substring(
74+
@Matrix,@SourceStringLength*(@TargetStringLength+1)+@TargetStringLength+1,1
75+
))
76+
END
77+
go
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
IF OBJECT_ID (N'LongestCommonSubsequence') IS NOT NULL
2+
DROP FUNCTION LongestCommonSubsequence
3+
GO
4+
5+
Create FUNCTION LongestCommonSubsequence
6+
/**
7+
summary: >
8+
The longest common subsequence (LCS) problem is the problem of finding the
9+
longest subsequence common to all sequences in two sequences. It differs
10+
from problems of finding common substrings: unlike substrings, subsequences
11+
are not required to occupy consecutive positions within the original
12+
sequences. For example, the sequences "1234" and "1224533324" have an LCS
13+
of "1234":
14+
Author: Phil Factor
15+
Revision: 1.0
16+
date: 05 Dec 2014
17+
example:
18+
code: |
19+
Select dbo.LongestCommonSubsequence ('1234', '1224533324')
20+
Select dbo.LongestCommonSubsequence ('thisisatest', 'testing123testing')
21+
Select dbo.LongestCommonSubsequence ( 'XMJYAUZ', 'MZJAWXU')
22+
Select dbo.LongestCommonSubsequence ( 'beginning-middle-ending',
23+
'beginning-diddle-dum-ending')
24+
returns: >
25+
the longest common subsequence as a string
26+
**/
27+
(
28+
@firstString Varchar(max),
29+
@SecondString Varchar(max)
30+
)
31+
RETURNS varchar(max)
32+
as begin
33+
34+
Declare @Array Varchar(MAX)
35+
Declare @ArrayMax int
36+
Declare @west char(1)
37+
Declare @Lines Varchar(max)
38+
Declare @ii int, @jj int, @iiMax int, @jjMax int, @index int
39+
40+
Select @iiMax=len(@FirstString), --the length of the first string
41+
@jjMax=len(@SecondString), --the length of the second string
42+
@index=@jjMax+1 --where the first new item (matrix has an extra row & column)
43+
Select @ArrayMax=(@iiMax)*(@jjMax)--total length of the array iterated
44+
Select @Array=replicate(char(0), @jjMax+1) --add in the first nought-filled row
45+
46+
Select
47+
@Index=@Index+case when (number-1) % @jjMax = 0 then 2 else 1 end, --current index
48+
@west=case when (number-1) % @jjMax = 0 then Char(0) else substring(@Array,@index-1,1) end,
49+
--as we add an extra column in, this is a dangerous operation otherwise
50+
@Array=@Array --we add a character to the string array, Char(0)--char
51+
+ case when (number-1) % @jjMax = 0 then Char(0) else '' end -- begin new row always zero
52+
+ case --check first to see if there was a match. If so, take north west number incremented
53+
when substring(@firstString,((Number-1)/@jjMax)+1,1)
54+
=substring(@SecondString, ((number-1)% @jjMax)+1 ,1)
55+
--remember to set the appropriate collation for this comparison!
56+
then Char(Ascii(substring(@Array,@index-@jjmax-2,1))+1)--increment the NW value
57+
when Ascii(substring(@Array,@index-@jjmax-1,1))>ascii(@west) --compare west to north
58+
then substring(@Array,@index-@jjmax-1,1) --and take the larger
59+
else @west-- number to west if larger
60+
end
61+
from numbers where number<=@ArrayMax
62+
63+
--Now all we need to do is to backtrack through the matrix to find the best solution
64+
Declare @commonString Varchar(max), @X_Y int
65+
Select @CommonString ='' --this contains the string
66+
Select @ii=@iimax+1,@jj=@jjmax+1
67+
while (@ii>1 and @jj>1)
68+
begin
69+
Select @X_Y = ((@ii-1)*(@jjMax+1))+@jj --the current matrix location
70+
if (substring(@firststring,@ii-1,1) = substring(@Secondstring,@jj-1,1))
71+
BEGIN --if there is a match, add the character (we'll reverse it later)
72+
Select @CommonString=@CommonString+substring(@firststring,@ii-1,1)
73+
select @jj=@jj-1, @ii=@ii-1 --go north-west
74+
end
75+
else if ascii(Substring(@Array,@X_Y,1)) = ascii(Substring(@Array, @X_Y-@jjMax-1, 1))
76+
select @ii=@ii-1
77+
ELSE if ascii(Substring(@Array,@X_Y,1)) = ascii(Substring(@Array, @X_Y-1, 1))
78+
select @jj=@jj-1
79+
else break
80+
if @@error>0 break
81+
end
82+
return Reverse(@CommonString)
83+
end
84+
go
85+
86+
Declare @timing datetime Select @Timing=GetDate()
87+
88+
if dbo.LongestCommonSubsequence ('1234', '1224533324')<>'1234'
89+
raiserror('test 1 failed',16,1)
90+
if dbo.LongestCommonSubsequence ('thisisatest', 'testing123testing')<>'tsitest'
91+
raiserror('test 2 failed',16,1)
92+
if dbo.LongestCommonSubsequence ('XMJYAUZ', 'MZJAWXU')<>'MJAU'
93+
raiserror('test 3 failed',16,1)
94+
if dbo.LongestCommonSubsequence ('yab', 'xabyrbyab')<>'yab'
95+
raiserror('test 4 failed',16,1)
96+
if dbo.LongestCommonSubsequence ('beginning-middle-ending','beginning-diddle-dum-ending')
97+
<>'beginning-iddle-ending' raiserror('test 5 failed',16,1)
98+
99+
select datediff(millisecond,@timing,GetDate()) as milliseconds

0 commit comments

Comments
 (0)