Below is my query utilizing a logic that determines a duplicate based on 3 matching identity fields.
The 5 identity fields we are using to match a duplicate is:
firstname
lastname
IDnumber
dob
addressstring
select *
from
(
select outerlogic.*
from (
SELECT
subid, lastOrig, firstOrig, lastname, firstname, idnumber, dob, addressString, pra_address1, pra_city, pra_state, pra_zipcode,
decode (
greatest(length(ct1), length(ct2), length(ct3), length(ct4), length(ct5),
length(ct6), length(ct7), length(ct8), length(ct9), length(ct10)),
length(ct1), ct1,
length(ct2), ct2,
length(ct3), ct3,
length(ct4), ct4,
length(ct5), ct5,
length(ct6), ct6,
length(ct7), ct7,
length(ct8), ct8,
length(ct9), ct9,
length(ct10), ct10
) idList
FROM
(
SELECT myQ.*
, listagg(subid, ',') within group (order by subid) OVER (PARTITION BY firstname, lastname, idnumber) AS ct1
, listagg(subid, ',') within group (order by subid) OVER (PARTITION BY firstname, dob, idnumber) AS ct2
, listagg(subid, ',') within group (order by subid) OVER (PARTITION BY firstname,idnumber,addressString) AS ct3
, listagg(subid, ',') within group (order by subid) OVER (PARTITION BY firstname, lastname, addressString) AS ct4
, listagg(subid, ',') within group (order by subid) OVER (PARTITION BY dob, idnumber, addressString) AS ct5
, listagg(subid, ',') within group (order by subid) OVER (PARTITION BY lastname, idnumber,addressString) AS ct6
, listagg(subid, ',') within group (order by subid) OVER (PARTITION BY firstname, dob,addressString) AS ct7
, listagg(subid, ',') within group (order by subid) OVER (PARTITION BY lastname, dob,addressString) AS ct8
, listagg(subid, ',') within group (order by subid) OVER (PARTITION BY firstname, lastname,dob) AS ct9
, listagg(subid, ',') within group (order by subid) OVER (PARTITION BY lastname,dob,idnumber) AS ct10
FROM (
select subid, lexid, lastname lastOrig, firstname firstOrig,
Regexp_replace(Upper(Trim(lastname)),'((\W)(DEL|DE\s+LA|DE|E|Y)($|\W))|'
||'((^)(DEL|DE\s+LA|DE|E|Y)(\W))|'
||'((\W)(MR|MRS|SR|JR|MS|MISS|II|III|IV|V)($|\W))|'
||'((^)(MR|MRS|SR|JR|MS|MISS|II|III|IV|V)(\W))|'
||'((^)([a-zA-Z])(\W))|'
||'([^a-zA-Z])','')
lastname,
Regexp_replace(Upper(Trim(firstname)),'((\W)(MR|MRS|SR|JR|MS|MISS|II|III|IV|V)($|\W))|'
||'((^)(MR|MRS|SR|JR|MS|MISS|II|III|IV|V)(\W))|'
||'([^a-zA-Z])','')
firstname,
trim(idnumber) idnumber, dob,
UPPER(trim(ADDRESS1))||UPPER(trim(CITY))||UPPER(trim(STATE))||ZIPCODE addressString,
address1 pra_address1, city pra_city, state pra_state, zipcode pra_zipcode
from subscriber
)logic
)
WHERE 1=1
AND
(
instr(ct1, ',') > 0
or instr(ct2, ',') > 0
or instr(ct3, ',') > 0
or instr(ct4, ',') > 0
or instr(ct5, ',') > 0
or instr(ct6, ',') > 0
or instr(ct7, ',') > 0
or instr(ct8, ',') > 0
or instr(ct9, ',') > 0
or instr(ct10, ',') > 0
)
group by subid, lastOrig,firstOrig,lastname, firstname, idnumber, dob, addressString, pra_address1, pra_city, pra_state, pra_zipcode,
greatest(length(ct1), length(ct2), length(ct3), length(ct4), length(ct5),
length(ct6), length(ct7), length(ct8), length(ct9), length(ct10)),
length(ct1), ct1,
length(ct2), ct2,
length(ct3), ct3,
length(ct4), ct4,
length(ct5), ct5,
length(ct6), ct6,
length(ct7), ct7,
length(ct8), ct8,
length(ct9), ct9,
length(ct10), ct10
)outerlogic
)
order by idList
;
Instead of relying on exact matches, I am hoping to implement the following conditions on the 5 identity fields:
First name and last name are switched (ex. MARY SMITH, SMITH MARY)
Partial names in same field (ex. MARY = MARY LOU)
DOB - MonthDate, DateYear, MonthYear match (1/1/1990 = 1/12/1990 or 12/12/1912 = 1/12/1912)
ID - 2 digits are matching in same digit field (ex. 1234 = 1256, and 1234 not = 4321)
AddressString - extracting only the street name (123 Main St = Main)