长话短说。我有数据试图通过地址识别重复记录。地址可以在[Address]
上匹配或 [Remit_Address]
领域。
我使用 JOIN 和 UNION 来获取记录,但我需要匹配的记录在结果中相互显示。
我无法按任何现有字段排序,因此典型的“ORDER BY”不起作用。我查看了 RANK
正如某人所建议的那样,它看起来可能有效,但我不知道如何进行分区,而且我认为 Order 给了我与 ORDER BY
相同的问题。 .
如果 RANK 不是最佳选择,我愿意接受其他想法。最终的目标是以某种方式对匹配的记录进行分组。
这是设置:
-- Output Table
CREATE TABLE [dupecheck] (
[id] int identity(1, 1),
[Data Area] varchar(255),
[Supplier_No] varchar(255),
[Name] varchar(255),
[Address] varchar(255),
[City] varchar(255),
[State] varchar(255),
[Zip] varchar(255),
[Country] varchar(255),
[Remit_Address] varchar(255),
[Remit_City] varchar(255),
[Remit_State] varchar(255),
[Remit_Zip] varchar(255),
[Remit_Country] varchar(255),
)
CREATE TABLE [sample_data] (
[Supplier_No] varchar(255),
[Name] varchar(255),
[Address] varchar(255),
[City] varchar(255),
[State] varchar(255),
[Zip] varchar(255),
[Country] varchar(255),
[Remit_Address] varchar(255),
[Remit_City] varchar(255),
[Remit_State] varchar(255),
[Remit_Zip] varchar(255),
[Remit_Country] varchar(255),
[cleanAddress] varchar(255),
[cleanRemit_Address] varchar(255),
CONSTRAINT [suppliers_pk] PRIMARY KEY ([Supplier_No])
)
INSERT INTO [sample_data] VALUES
('1039104','Geez Companies','100 Aero Hudson Rd','Streetsboro','OH','44241','','100 Aero Hudson Road','Streetsboro','OH','44241','USA','100 Aero Hudson Rd','100 Aero Hudson Rd'),
('1218409','SouthWestern Medical','100 West Balor Ave','Osceola','AR','72370','USA','SouthWestern Medical100 W Balor Ave','Osceola','AR','72370','USA','100 W Balor Ave','SouthWestern Medical100 W Balor Ave'),
('1243789','SouthWestern Medical','100 West Balor Ave','Osceola','AR','72370','USA','SouthWestern Medical100 West Balor Ave','Osceola','AR','72370','USA','100 W Balor Ave','SouthWestern Medical100 W Balor Ave'),
('1243636','SIRI SYSTEMS','15 BRAD ROAD','WEXFORD','PA','15090','','','','','','','15 BRAD RD',''),
('1152482','FLEETWOOD MACK','22 WINDSOCK CT','ADDISON','IL','60101','','PO BOX 951','CHICAGO','IL','60694-5124','','22 WINDSOCK CT','PO BOX 951'),
('1224483','Aerospace Junction','211500 Communicate Ave','Mingo Junction','OH','43939','USA','P O Box 99','Mingo Junction','OH','43939','USA','211500 Communicate Ave','PO Box 99'),
('1243397','Squeezy Felt','SCHREIBER DIST','NEW KENSINGTON','PA','15068','','','','','','','SCHREIBER DIST',''),
('1230895','NERO CO','28 North US State Highway 99','Osceola','AR','72370','USA','PO Box 204','Cape Girardeau','MO','63702-2045','USA','28 N US State Hwy 99','PO Box 204'),
('1243782','NERO CO','28 North US State Highway 99','Osceola','AR','72370','USA','PO Box 204','Cape Girardeau','MO','63702-2045','USA','28 N US State Hwy 99','PO Box 204'),
('1135880','RICHARD PRYOR SEMINARS','PO BOX 2194','KANSAS CITY','MO','64121-9468','USA','RICHARD PRYOR SEMINARS P O BOX 2194','KANSAS CITY','MO','64121-9468','USA','PO BOX 2194','RICHARD PRYOR SEMINARS PO BOX 2194'),
('1241328','INFINITY AND BEYOND','P.O. BOX 169','GASTONIA','NC','28053-0269','USA','','','','','','PO BOX 169',''),
('1259522','ZEEBO INC','PO BOX 169','GASSTONIA','NC','28053-0269','USA','','','','','','PO BOX 169',''),
('1255253','AT&T','PO Box 50221','Carol Stream','IL','60197','USA','','','','','','PO Box 50221',''),
('1135513','AT&T','PO Box 50221','Carol Stream','IL','60197-5080','USA','','','','','','PO Box 50221',''),
('1119161','Machine Co, Inc','3306 N Thorne Blvd','Chattanooga','TN','','','PO BOX 5301','CHATTANOOGA','TN','37406','USA','3306 N Thorne Blvd','PO BOX 5301'),
('1176587','Topsy Turvy','365 Welmington Road','Chicago','IL','60606','USA','','','','','','365 Welmington Rd',''),
('2156671','Topsy Turvvy, Inc.','P.O. Box 55217','Columbus','OH','43081','','365 Welmington Road','Chicago','IL','60606','USA','','365 Welmington Rd')
CREATE TABLE [dupe_addresses](
[NewAdd] [varchar](255) NULL
)
INSERT INTO [dupe_addresses] VALUES
('100 W Balor Ave'),
('28 N US State Hwy 99'),
('365 Welmington Rd'),
('PO BOX 169'),
('PO Box 204'),
('PO Box 50221'),
('SouthWestern Medical100 W Balor Ave')
现有查询:INSERT INTO [dupecheck]
SELECT * FROM (
SELECT
'Address Match' AS [Reason],
pv.[Supplier_No],
pv.[Name],
pv.[Address],
pv.[City],
pv.[State],
pv.[Zip],
pv.[Country],
pv.[Remit_Address],
pv.[Remit_City],
pv.[Remit_State],
pv.[Remit_Zip],
pv.[Remit_Country]
FROM [dupe_addresses] n
LEFT JOIN [sample_data] pv
ON
(n.[NewAdd] = pv.[cleanAddress] AND ( [Address] <> '' AND [Address] IS NOT NULL ) )
WHERE ([Supplier_No] IS NOT NULL AND [Supplier_No] <> '')
UNION
SELECT
'Address Match' AS [Reason],
pv.[Supplier_No],
pv.[Name],
pv.[Address],
pv.[City],
pv.[State],
pv.[Zip],
pv.[Country],
pv.[Remit_Address],
pv.[Remit_City],
pv.[Remit_State],
pv.[Remit_Zip],
pv.[Remit_Country]
FROM [dupe_addresses] n
LEFT JOIN [sample_data] pv
ON
(n.[NewAdd] = pv.[cleanRemit_Address] AND ( [Remit_Address] <> '' AND [Remit_Address] IS NOT NULL) )
WHERE ([Supplier_No] IS NOT NULL AND [Supplier_No] <> '')
) q1
当前结果:Reason Supplier_No Name Address City State Zip Country Remit_Address Remit_City Remit_State Remit_Zip Remit_Country
Address Match 1135513 AT&T PO Box 50221 Carol Stream IL 60197-5080 USA
Address Match 1176587 Topsy Turvy 365 Welmington Road Chicago IL 60606 USA
Address Match 1218409 SouthWestern Medical 100 West Balor Ave Osceola AR 72370 USA SouthWestern Medical100 W Balor Ave Osceola AR 72370 USA
Address Match 1230895 NERO CO 28 North US State Highway 99 Osceola AR 72370 USA PO Box 204 Cape Girardeau MO 63702-2045 USA
Address Match 1241328 INFINITY AND BEYOND P.O. BOX 169 GASTONIA NC 28053-0269 USA
Address Match 1243782 NERO CO 28 North US State Highway 99 Osceola AR 72370 USA PO Box 204 Cape Girardeau MO 63702-2045 USA
Address Match 1243789 SouthWestern Medical 100 West Balor Ave Osceola AR 72370 USA SouthWestern Medical100 West Balor Ave Osceola AR 72370 USA
Address Match 1255253 AT&T PO Box 50221 Carol Stream IL 60197 USA
Address Match 1259522 ZEEBO INC PO BOX 169 GASSTONIA NC 28053-0269 USA
Address Match 2156671 Topsy Turvvy, Inc. P.O. Box 55217 Columbus OH 43081 365 Welmington Road Chicago IL 60606 USA
预期结果:Reason Supplier_No Name Address City State Zip Country Remit_Address Remit_City Remit_State Remit_Zip Remit_Country rank
Address Match 1135513 AT&T PO Box 50221 Carol Stream IL 60197-5080 USA 1
Address Match 1255253 AT&T PO Box 50221 Carol Stream IL 60197 USA 1
Address Match 1241328 INFINITY AND BEYOND P.O. BOX 169 GASTONIA NC 28053-0269 USA 2
Address Match 1259522 ZEEBO INC PO BOX 169 GASSTONIA NC 28053-0269 USA 2
Address Match 1243782 NERO CO 28 North US State Highway 99 Osceola AR 72370 USA PO Box 204 Cape Girardeau MO 63702-2045 USA 3
Address Match 1230895 NERO CO 28 North US State Highway 99 Osceola AR 72370 USA PO Box 204 Cape Girardeau MO 63702-2045 USA 3
Address Match 1218409 SouthWestern Medical 100 West Balor Ave Osceola AR 72370 USA SouthWestern Medical100 W Balor Ave Osceola AR 72370 USA 4
Address Match 1243789 SouthWestern Medical 100 West Balor Ave Osceola AR 72370 USA SouthWestern Medical100 West Balor Ave Osceola AR 72370 USA 4
Address Match 2156671 Topsy Turvvy, Inc. P.O. Box 55217 Columbus OH 43081 365 Welmington Road Chicago IL 60606 USA 5
Address Match 1176587 Topsy Turvy 365 Welmington Road Chicago IL 60606 USA 5
最佳答案
此查询创建所需的结果。
with cte as (
select s2.NewAdd grp, s1.*
, rank() over(partition by Supplier_No order by s2.NewAdd) rnk
from sample_data s1
inner join dupe_addresses s2 on
(s1.cleanAddress=s2.newAdd) or (s1.cleanRemit_Address=s2.newAdd)
)
select c1.*
from cte c1
where rnk = 1
order by c1.grp
删除了 Union ,并通过 OR
组合了两个连接条件反而。因此可以找到同时具有这两个条件的记录。
rank()
用于计算结果集分区内每一行的排名。partition by Supplier_No
用于识别重复记录。最后,使用
where rnk = 1
不重复地查看记录组。
关于sql - 如何使用 RANK 对匹配记录进行分组,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/67016725/