Skip to content

Commit 2832803

Browse files
committed
Add dbo.udf_LongestCommonSubstring table function
1 parent 5d8d738 commit 2832803

File tree

1 file changed

+129
-0
lines changed

1 file changed

+129
-0
lines changed
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
--The function below needs a Numbers table stocked with consecutive integers.
--If you haven't got one, this batch creates dbo.Numbers and fills it automatically.
--Object names are schema-qualified and cased exactly as declared so the batch
--also runs on case-sensitive collations.
IF OBJECT_ID(N'dbo.Numbers', N'U') IS NULL
  BEGIN
    CREATE TABLE [dbo].[Numbers]
      (
      [number] [int] NOT NULL,
      CONSTRAINT [Index_Numbers] PRIMARY KEY CLUSTERED ([number] ASC)
        ON [PRIMARY]
      )
    ON [PRIMARY];
  END;

--The presence of 99999 is the test for "already stocked". If it is missing,
--the table is emptied and refilled with 1 through 10,000,000.
IF NOT EXISTS (SELECT 1 FROM [dbo].[Numbers] WHERE [number] = 99999)
  BEGIN
    TRUNCATE TABLE [dbo].[Numbers];

    --seven crossed copies of the ten digits give every value 0..9,999,999;
    --adding 1 shifts the range so that it starts at 1
    WITH Digits(i) AS
      (SELECT i
       FROM (VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9), (0)) AS X(i))
    INSERT INTO [dbo].[Numbers] ([number])
    SELECT (D6.i * 1000000 + D5.i * 100000 + D4.i * 10000 + D3.i * 1000
            + D2.i * 100 + D1.i * 10 + D0.i + 1) AS seq
    FROM Digits AS D0
      CROSS JOIN Digits AS D1
      CROSS JOIN Digits AS D2
      CROSS JOIN Digits AS D3
      CROSS JOIN Digits AS D4
      CROSS JOIN Digits AS D5
      CROSS JOIN Digits AS D6;
  END;
26+
27+
--clear out any earlier version so that the CREATE FUNCTION batch can run
IF OBJECT_ID (N'LongestCommonSubstring') IS NOT NULL
  BEGIN
    DROP FUNCTION LongestCommonSubstring;
  END
29+
GO
30+
31+
CREATE FUNCTION LongestCommonSubstring
/**
summary: >
 The longest common substring (LCS) is the longest run of consecutive characters
 that appears in both of two strings.
 If you, for example, were to compare 'And the Dish ran away with the Spoon' with 'away', you'd
 get 'away' as being the string in common. Likewise, comparing '465932859472109683472' with
 '697834859472135348' would give you '8594721'. This returns a one-row table that gives you the
 length and location of the string as well as the string itself. It can easily be modified to give
 you all the substrings (whatever your criteria for the smallest substring, e.g. two characters).
 No row is returned when the strings share no character, or when either is NULL or empty.
 When several substrings tie for longest, the one starting earliest in the first string is
 returned, then the one starting earliest in the second string.
notes: >
 Needs a Numbers table whose [number] column holds consecutive integers from 1 up to
 at least the length of the longer input.
 LEN() ignores trailing spaces, so trailing blanks on either input are not compared.
 Characters are compared with '=', so case sensitivity follows the collation in use.
Author: Phil Factor
Revision: 1.0
date: 05 Dec 2014
example:
     code: |
     Select * from dbo.LongestCommonSubstring ('1234', '1224533324')
     Select * from dbo.LongestCommonSubstring ('thisisatest', 'testing123testing')
     Select * from dbo.LongestCommonSubstring ( 'findthishere', 'where is this?')
     Select * from dbo.LongestCommonSubstring ( null, 'xab')
     Select * from dbo.LongestCommonSubstring ( 'not beginning-middle-ending',
                                                  'beginning-diddle-dum-ending')
returns: >
  a table of at most one row: the length of the match, where it starts in each
  string, and the matched text itself
**/
(
@firstString VARCHAR(MAX),
@SecondString VARCHAR(MAX)
)
RETURNS @hit TABLE --returns a single row table
--(it is easy to change to return a string but I wanted the location of the match)
  (
  MatchLength INT,--the length of the match. Not necessarily the length of input
  FirstCharInMatch INT,--first character of match in first string
  FirstCharInString INT,--first character of match in second string
  CommonString VARCHAR(8000) --the matched text, read from the second string
  )

AS BEGIN
  --work out the lengths once; a NULL input gives a NULL length and so no rows below
  DECLARE @FirstLength INT, @SecondLength INT;
  SELECT @FirstLength = LEN(@firstString), @SecondLength = LEN(@SecondString);

  WITH FirstChars AS
    (--divide up the first string into a table of characters/sequence
    SELECT number AS Position, SUBSTRING(@firstString, number, 1) AS ch
      FROM Numbers
      WHERE number >= 1 AND number <= @FirstLength --ignore any zero/negative tally rows
    ),
  SecondChars AS
    (--divide up the second string into a table of characters/sequence
    SELECT number AS Position, SUBSTRING(@SecondString, number, 1) AS ch
      FROM Numbers
      WHERE number >= 1 AND number <= @SecondLength
    ),
  Matches AS
    (--every pairing of equal characters. Pairings that belong to the same common
     --substring share the same offset (first position minus second position)
    SELECT FirstChars.Position - SecondChars.Position AS TheGroup,
           FirstChars.Position AS FirstOrder,
           SecondChars.Position AS SecondOrder
      FROM FirstChars
        INNER JOIN SecondChars
          ON FirstChars.ch = SecondChars.ch
    ),
  Runs AS
    (--within one offset, position minus row number stays constant for as long as
     --the positions are consecutive, so it labels each unbroken run of matches
    SELECT TheGroup, FirstOrder, SecondOrder,
           SecondOrder - ROW_NUMBER() OVER (PARTITION BY TheGroup
                                            ORDER BY SecondOrder) AS Sequential
      FROM Matches
    )
  --now we just aggregate each run, and choose the first longest match
  INSERT INTO @hit (MatchLength, FirstCharInMatch, FirstCharInString, CommonString)
    SELECT TOP (1) --just the first. You may want more so feel free to change
      COUNT(*) AS MatchLength,
      MIN(FirstOrder) AS FirstCharInMatch,
      MIN(SecondOrder) AS FirstCharInString,
      SUBSTRING(@SecondString,
                MIN(SecondOrder),
                COUNT(*)) AS CommonString
    FROM Runs
    GROUP BY TheGroup, Sequential
    ORDER BY COUNT(*) DESC, MIN(FirstOrder) ASC, MIN(SecondOrder) ASC;
  RETURN;
END--and we do a test run
107+
108+
go
109+
110+
--do an outer apply over known inputs to check for the obvious flaws, and raise
--an error if any result differs from the expected answer.
--A Correct value of NULL means that no common substring is expected; OUTER APPLY
--then yields a NULL CommonString because the function returns no row.
IF EXISTS (
  SELECT X.FirstString, X.SecondString, X.Correct,
         LCS.MatchLength, LCS.FirstCharInMatch, LCS.FirstCharInString, LCS.CommonString
  FROM (VALUES
    ('Call me Ishmael. Some years ago...','Something','Some' ),
    ('unrestfulness','having little or no money in my purse, and nothing particular to interest me on shore','rest' ),
    ('1234563457','3456','3456' ),
    ('','',NULL ),
    (NULL,'',NULL ),
    ('I find myself involuntarily pausing before coffin warehouses','Jailhouse rock','house'),
    (',.-=dfgd%','-=','-='),
    ('protest is useless','I need to test this routine. Tests are valuable','test' )
    )
    AS X(FirstString, SecondString, Correct)
    OUTER APPLY dbo.LongestCommonSubstring(X.FirstString, X.SecondString) AS LCS
  --NULL-aware mismatch test: exactly one side NULL, or both present but different
  WHERE (X.Correct IS NULL AND LCS.CommonString IS NOT NULL)
     OR (X.Correct IS NOT NULL AND LCS.CommonString IS NULL)
     OR X.Correct <> LCS.CommonString
  )
  RAISERROR ('the LongestCommonSubstring routine has broken',16,1)

0 commit comments

Comments
 (0)