Challenge: Find the correct order hierarchy

barnabeck · June 8, 2016, 8:11am

I bumped into a problem that in my opinion is really interesting. I found a relatively easy itinerary solution, but it completely lacks performance when applied to a bigger stack of records... and in the end it turned out to be even wrong!

Imagine a list of names with an underlying, BUT NOT KNOWN order: (we make it visible to make it easier to follow )

Peter, Paul, John, George, Mary, Anne, Eric, Thomas, Susan, Liz

The only thing you have are samples with subsets of those names, listed in the correct hierarchical order.
sample1: John, Anne, Susan
sample2: George, Thomas, Susan, Liz
sample3: Peter, Paul, Eric, Liz
sample4: Paul, Susan
sample5: Mary, Eric, Susan
sample6: Paul, George, Mary, Anne, Thomas
sample7: Paul, John, Mary, Eric

The goal is to find the correct order; or at least a possible correct order for the amount of given samples. Just imagine that there will be 400 names and 500'000 samples.

I post what I thought was a solution: Taking the average ranking within the samples as a measure. This will for sure bring #1 in Top position but lower ranked names will be calculated wrong. You'll need a splitter function that returns the name and its position. like this one -> http://www.sqlservercentral.com/articles/Tally+Table/72993/

Create TABLE #Sample
(
ID		int,
Sample varchar(100)
)
INSERT into #Sample (ID,Sample) values (1,'John,Anne,Susan'),(2,'George,Thomas,Susan,Liz'), (3,'Peter,Paul,Eric,Liz'),(4,'Paul,Susan'),(5,'Mary,Eric,Susan'),(6,'Paul,George,Mary,Anne,Thomas'), (7,'Paul,John,Mary,Eric')

Create TABLE #Names
(
ID		int,
Name	varchar(10),
rank	int
)
Insert into #Names
(
	ID,
Name,
rank
)
SELECT ID, Item, ItemNumber

FROM #Sample
CROSS APPLY
(
	SELECT	Item, Itemnumber FROM dbo.Split(Sample, ',')
) AS Y

select * from #Names

select name, 1.0*sum(rank)/count(rank) as ord 
from #Names
Group by name
order by ord asc

drop Table #Sample
drop Table #Names

barnabeck · June 8, 2016, 11:01am

This is a solution but hardly applicable for the amount of record I have: I look for the name on #top, save his name in the #Ranking table, eliminate him from the samples, and restart again

Create TABLE #Ranking
(
    ID	    int IDENTITY(1,1) NOT NULL,
    Name    varchar(10),
    rank    real
)

declare @loop int
set @loop = 1
declare @max int
set @max = (SELECT COUNT(DISTINCT Name) from #Names)

WHILE @loop <= @max
    BEGIN

    INSERT into #Ranking
    (
        Name,
        rank
    )
    select TOP 1 Name, COALESCE(1.0*SUM(rank)/COUNT(rank),1) as ord from #Names where Name not in (SELECT Name from #Ranking) GROUP BY Name ORDER BY ord asc


    update #Sample set Sample = Replace(Replace(Replace('XXX'+Replace(Replace(Sample,(Select TOP 1 Name from #Ranking order by ID desc),'') + 'XXX',',XXX',''),'XXX,',''),',,',','),'XXX','')

    delete #Names

    Insert into #Names
    (
        ID,
        Name,
        rank
    )
    SELECT ID, Item, ItemNumber

    FROM #Sample
    CROSS APPLY
    (
	SELECT	Item, COALESCE(ItemNumber,1) as ItemNumber FROM dbo.Split(Sample, ',')
     ) AS Y
     WHERE Item <> ''

    set @loop = @loop + 1

END
select * from #Ranking


drop Table #Sample
drop Table #Names
drop Table #Ranking

ScottPletcher · June 8, 2016, 6:34pm

Maybe the code below?! Not sure how long it will run on a large sample table, but of course you can test that :-).

IF OBJECT_ID('tempdb.dbo.#names') IS NOT NULL
    DROP TABLE #names
CREATE TABLE #names (
    name varchar(100) not null,
    sample_id int not null,
    sequence smallint not null
    )
--DROP INDEX names__CL ON #names;
CREATE UNIQUE CLUSTERED INDEX names__CL 
    ON #names ( name, sequence, sample_id ) 
    WITH ( FILLFACTOR = 100 );

IF OBJECT_ID('tempdb.dbo.#results') IS NOT NULL
    DROP TABLE #results
CREATE TABLE #results (
    final_sequence int IDENTITY(1, 1) NOT NULL,
    name varchar(100) NOT NULL
    )
CREATE UNIQUE CLUSTERED INDEX results__CL 
    ON #results ( final_sequence ) 
    WITH ( FILLFACTOR = 100 );

INSERT INTO #names
SELECT LTRIM(RTRIM(ds.Item)) AS name, s.id AS sample_id, ds.ItemNumber AS sequence
FROM #sample s
CROSS APPLY dbo.DelimitedSplit8K(sample, ',') ds

SELECT DISTINCT name FROM #names ORDER BY name --just to show all names

DECLARE @max_sequence_before_this_insert int

WHILE EXISTS(SELECT 1 FROM #names)
BEGIN
    SELECT @max_sequence_before_this_insert = ISNULL(MAX(final_sequence), 0)
    FROM #results
    
    INSERT INTO #results
    SELECT name
    FROM #names
    GROUP BY name
    HAVING MAX(sequence) = 1
    
    UPDATE #names
    SET sequence = sequence - 1
    WHERE sample_id IN (SELECT DISTINCT sample_id FROM #names 
        WHERE name IN (SELECT name FROM #results WHERE final_sequence > @max_sequence_before_this_insert))
    
    DELETE FROM #names
    WHERE name IN (SELECT name FROM #results WHERE final_sequence > @max_sequence_before_this_insert)

END /*WHILE*/

SELECT * FROM #results --show all names in hierarchical order (I hope!)