相似度函数
概述
比较两个字段的相似度
最近有人问到关于两个字段求相似度的函数,所以就写了一篇关于相似度的函数,分别是“简单的模糊匹配”,“顺序匹配”,“一对一位置匹配”。在平时的这种函数可能会需要用到,可能业务需求不一样,这里只给出参照,实际情况可以相对修改。
本文所有的两个字段比较都是除以比较字段本身,例如A与B比较,找出的长度除以A的长度,因为考虑如果A的长度大于B的长度,相似度会超100%,例如‘abbc’,'ab'.
如果大家想除以B的长度,只需要在语句末尾将‘ SET @num=@num*1.0/LEN(@Cloumna) ’修改成‘ SET @num=@num*1.0/LEN(@Cloumnb) ’
步骤
1.两个字段简单相似
--
-两个字段简单相似
CREATE
FUNCTION
DBO.FN_Resemble
(
@Cloumna
NVARCHAR
(
MAX
),
@Cloumnb
NVARCHAR
(
MAX
)
)
RETURNS
FLOAT
AS
BEGIN
DECLARE
@num
FLOAT
,
@len
int
SET
@Cloumna
=
ISNULL
(
@Cloumna
,
0
)
SET
@Cloumnb
=
ISNULL
(
@Cloumnb
,
0
)
SET
@len
=
1
SET
@num
=
0
WHILE
(
LEN
(
@Cloumna
)
<>
0
AND
LEN
(
@CloumnB
)
<>
0
)
BEGIN
WHILE
(
@len
<=
LEN
(
@Cloumna
))
BEGIN
DECLARE
@a
NVARCHAR
(
4
)
SET
@a
=
''
SET
@a
=
SUBSTRING
(
@Cloumna
,
@len
,
1
)
IF
(
CHARINDEX
(
@a
,
@CloumnB
)
>
0
)
BEGIN
SET
@num
=
@num
+
1
END
SET
@len
=
@len
+
1
END
SET
@num
=
@num
*
1.0
/
LEN
(
@Cloumna
)
BREAK
END
RETURN
@num
END
--
--测试代码
SELECT
DBO.FN_Resemble(
'
ABDC321G
'
,
'
ABDC123G
'
)
2.两个字段顺序相似
--
-两个字段顺序相似
CREATE
FUNCTION
DBO.FN_Resemble_order
(
@Cloumna
NVARCHAR
(
MAX
),
@Cloumnb
NVARCHAR
(
MAX
)
)
RETURNS
FLOAT
AS
BEGIN
DECLARE
@num
FLOAT
,
@len
int
SET
@Cloumna
=
ISNULL
(
@Cloumna
,
0
)
SET
@Cloumnb
=
ISNULL
(
@Cloumnb
,
0
)
SET
@len
=
1
SET
@num
=
0
WHILE
(
LEN
(
@Cloumna
)
<>
0
AND
LEN
(
@CloumnB
)
<>
0
)
BEGIN
DECLARE
@a
NVARCHAR
(
4
)
DECLARE
@b
NVARCHAR
(
4
)
IF
(
LEN
(
@Cloumna
)
>=
LEN
(
@CloumnB
))
BEGIN
WHILE
(
@len
<=
LEN
(
@CloumnB
))
BEGIN
SET
@a
=
''
SET
@a
=
SUBSTRING
(
@Cloumna
,
@len
,
1
)
SET
@b
=
''
SET
@b
=
SUBSTRING
(
@CloumnB
,
@len
,
1
)
IF
(
@a
=
@b
)
BEGIN
SET
@num
=
@num
+
1
END
ELSE
BEGIN
break
END
SET
@len
=
@len
+
1
END
END
ELSE
IF
(
LEN
(
@Cloumna
)
<
LEN
(
@CloumnB
))
BEGIN
WHILE
(
@len
<=
LEN
(
@Cloumna
))
BEGIN
SET
@a
=
''
SET
@a
=
SUBSTRING
(
@Cloumna
,
@len
,
1
)
SET
@b
=
''
SET
@b
=
SUBSTRING
(
@CloumnB
,
@len
,
1
)
IF
(
@a
=
@b
)
BEGIN
SET
@num
=
@num
+
1
END
ELSE
BEGIN
break
END
SET
@len
=
@len
+
1
END
END
SET
@num
=
@num
*
1.0
/
LEN
(
@Cloumna
)
BREAK
END
RETURN
@num
END
go
--
--测试代码
SELECT
DBO.FN_Resemble_order(
'
ABDC456G
'
,
'
ABDC123G
'
)
3.两个字段一对一相似
--
-两个字段一对一相似
CREATE
FUNCTION
DBO.FN_Resemble_onebyone
(
@Cloumna
NVARCHAR
(
MAX
),
@Cloumnb
NVARCHAR
(
MAX
)
)
RETURNS
FLOAT
AS
BEGIN
DECLARE
@num
FLOAT
,
@len
int
SET
@Cloumna
=
ISNULL
(
@Cloumna
,
0
)
SET
@Cloumnb
=
ISNULL
(
@Cloumnb
,
0
)
SET
@len
=
1
SET
@num
=
0
WHILE
(
LEN
(
@Cloumna
)
<>
0
AND
LEN
(
@CloumnB
)
<>
0
)
BEGIN
DECLARE
@a
NVARCHAR
(
4
)
DECLARE
@b
NVARCHAR
(
4
)
IF
(
LEN
(
@Cloumna
)
>=
LEN
(
@CloumnB
))
BEGIN
WHILE
(
@len
<=
LEN
(
@CloumnB
))
BEGIN
SET
@a
=
''
SET
@a
=
SUBSTRING
(
@Cloumna
,
@len
,
1
)
SET
@b
=
''
SET
@b
=
SUBSTRING
(
@CloumnB
,
@len
,
1
)
IF
(
@a
=
@b
)
BEGIN
SET
@num
=
@num
+
1
END
SET
@len
=
@len
+
1
END
END
ELSE
IF
(
LEN
(
@Cloumna
)
<
LEN
(
@CloumnB
))
BEGIN
WHILE
(
@len
<=
LEN
(
@Cloumna
))
BEGIN
SET
@a
=
''
SET
@a
=
SUBSTRING
(
@Cloumna
,
@len
,
1
)
SET
@b
=
''
SET
@b
=
SUBSTRING
(
@CloumnB
,
@len
,
1
)
IF
(
@a
=
@b
)
BEGIN
SET
@num
=
@num
+
1
END
SET
@len
=
@len
+
1
END
END
SET
@num
=
@num
*
1.0
/
LEN
(
@Cloumna
)
BREAK
END
RETURN
@num
END
--
--测试代码
SELECT
DBO.FN_Resemble_onebyone(
'
ABDC456G
'
,
'
ABDC123G
'
)
总结
如果大家觉得文章对大家有帮助,麻烦给个推荐,谢谢。
|
备注: 作者: pursuer.chen 博客: http://www.cnblogs.com/chenmh 本站点所有随笔都是原创,欢迎大家转载;但转载时必须注明文章来源,且在文章开头明显处给明链接,否则保留追究责任的权利。 《欢迎交流讨论》 |

