SQLTeam.com | Weblogs | Forums

Show all rows which have a duplicate


#1

Hi

I have the following SQL query which shows me which Supplier_Part_Numbers have more than one Part_Number listed against them,

SELECT dbo.RemoveNonAlphaCharacters(RTRIM(LTRIM(Supplier_Part_Number)))
	, COUNT(a1.Part_Number)
FROM dbo.Supplier_Price_File as a1
GROUP BY dbo.RemoveNonAlphaCharacters(RTRIM(LTRIM(Supplier_Part_Number)))
HAVING COUNT(a1.Part_Number) > 1

The dbo.RemoveNonAlphaCharacters(RTRIM(LTRIM(Supplier_Part_Number))) is standardising all the Supplier_Part_Numbers to find the all duplicate values.

I would like the results to show the part_number which the supplier_part_number is listed against. Something like

Supplier_Code_1 | Part_Number_A
Supplier_Code_1 | Part_Number_B
Supplier_Code_2 | Part_Number_C
Supplier_Code_2 | Part_Number_D
Supplier_Code_2 | Part_Number_E

I have tried various methods of joins but none seem to be working correctly or very quickly.

Any suggestions would be great.

Thanks


#2

This won't be fast!

SELECT SupplierCode, Supplier_Part_Number
FROM dbo.Supplier_Price_File as a1
WHERE dbo.RemoveNonAlphaCharacters(RTRIM(LTRIM(Supplier_Part_Number))) IN
	(
		SELECT dbo.RemoveNonAlphaCharacters(RTRIM(LTRIM(Supplier_Part_Number))) AS MyPartNo
		--	, COUNT(a1.Part_Number)
		FROM dbo.Supplier_Price_File as a1
		GROUP BY dbo.RemoveNonAlphaCharacters(RTRIM(LTRIM(Supplier_Part_Number)))
		HAVING COUNT(a1.Part_Number) > 1
)

not sure of the exact code (posting DDL and sample data would allow a tested solution ...), but maybe something like this would do

SELECT *
FROM
(
	SELECT	[MyProductCode] = dbo.RemoveNonAlphaCharacters(RTRIM(LTRIM(Supplier_Part_Number)))
		, SupplierCode
		, Supplier_Part_Number
		, [T_GroupRowCount] = COUNT(*) OVER(PARTITION BY dbo.RemoveNonAlphaCharacters(RTRIM(LTRIM(Supplier_Part_Number))))
	FROM dbo.Supplier_Price_File as a1
) AS T
WHERE	T_GroupRowCount > 1

#3

The following should be a fairly speedy method... It chugged through 1M rows in just under 2 seconds on a fairly anemic test machine...

--============================================================================
-- create 1M rows of test data
IF OBJECT_ID('tempdb..#Supplier_Parts', 'U') IS NOT NULL 
DROP TABLE #Supplier_Parts;

CREATE TABLE #Supplier_Parts (
	Suplier_Code int NOT NULL,
	Part_Number CHAR(10) NOT NULL 
	);

WITH 
	cte_n1 (n) AS (SELECT 1 FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) n (n)), 
	cte_n2 (n) AS (SELECT 1 FROM cte_n1 a CROSS JOIN cte_n1 b),
	cte_n3 (n) AS (SELECT 1 FROM cte_n2 a CROSS JOIN cte_n2 b),
	cte_Tally (n) AS (
		SELECT TOP 1000000
			ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
		FROM
			cte_n3 a CROSS JOIN cte_n3 b
		)
INSERT #Supplier_Parts (Suplier_Code, Part_Number)
SELECT 
	Suplier_Code = ABS(CHECKSUM(NEWID()) % 500) + 1,
	Part_Number = CONCAT(a.Val, b.Val, c.Val, d.Val, e.Val, f.Val, g.Val, h.Val, i.Val, j.Val)
FROM
	cte_Tally t
	CROSS APPLY ( VALUES (CHAR(ABS(CHECKSUM(NEWID()) % 26) + 65)) ) a (Val)
	CROSS APPLY ( VALUES (CHAR(ABS(CHECKSUM(NEWID()) % 26) + 65)) ) b (Val)
	CROSS APPLY ( VALUES (CHAR(ABS(CHECKSUM(NEWID()) % 26) + 65)) ) c (Val)
	CROSS APPLY ( VALUES (CHAR(ABS(CHECKSUM(NEWID()) % 26) + 65)) ) d (Val)
	CROSS APPLY ( VALUES (CHAR(ABS(CHECKSUM(NEWID()) % 26) + 65)) ) e (Val)
	CROSS APPLY ( VALUES (CHAR(ABS(CHECKSUM(NEWID()) % 26) + 65)) ) f (Val)
	CROSS APPLY ( VALUES (CHAR(ABS(CHECKSUM(NEWID()) % 26) + 65)) ) g (Val)
	CROSS APPLY ( VALUES (CHAR(ABS(CHECKSUM(NEWID()) % 26) + 65)) ) h (Val)
	CROSS APPLY ( VALUES (CHAR(ABS(CHECKSUM(NEWID()) % 26) + 65)) ) i (Val)
	CROSS APPLY ( VALUES (CHAR(ABS(CHECKSUM(NEWID()) % 26) + 65)) ) j (Val);

-- add a covering "POC" index... to remove the sort operation.
CREATE NONCLUSTERED INDEX ix_SupplierParts_SupplierCode ON #Supplier_Parts (
	Suplier_Code
	)
INCLUDE (
	Part_Number
	);

--============================================================================
-- the solution... 
WITH 
	cte_Supplier_Part_Counts AS (
		SELECT 
			sp.Suplier_Code, 
			sp.Part_Number,
			Sup_Part_Count = COUNT(1) OVER (PARTITION BY sp.Suplier_Code)
		FROM
			#Supplier_Parts sp
		)
SELECT 
	spc.Suplier_Code, 
	spc.Part_Number
	INTO #temp --<< dump to a temp table to prevent display rendering...
FROM
	cte_Supplier_Part_Counts spc
WHERE 
	spc.Sup_Part_Count > 1;

DROP TABLE #temp;