Find the last modified date for an attribute

tiki1980 · January 31, 2018, 12:34pm

I have got a table with a key and an attribute. For this I want to find the last_modified_dt of the reference value bases on the refrence_date, see the last column of the example for output i'd like to have. This is a bit more difficult then initially thought off because of the repeating values and the date one likes to have there. See the value X and the dates that have to be determined.

I've come up with the following solution but i'm curious to find out if there's a better one. At the bottom you will find a download link to my solution and a generator that will create 10.000.000 rows of data. On my system my solution will run for 3.7633 minutes.

My solution

SET STATISTICS TIME ON;

WITH base_selection
AS (SELECT
          id
         ,key_value
         ,reference_date
         ,reference_value
         -- determine the previous reference_value (can be removed from final solution)
		 ,LAG(reference_value, 1,NULL) OVER (PARTITION BY key_value ORDER BY reference_date ASC) AS previous_reference_value
         --if previous reference value = equal to current value then 0 else 1. This to determine the first occurence of a value
         ,CASE
              WHEN LAG(reference_value, 1, NULL) OVER (PARTITION BY key_value ORDER BY reference_date ASC) = reference_value THEN 0
              ELSE 1
          END                                                                                                AS first_occurance_of_value
    FROM  testdata
    WHERE 1 = 1
--AND key_value = 'K0'
)
-- determine the first occurences
    ,first_occurances
AS (SELECT *
    FROM   base_selection
    WHERE  first_occurance_of_value = 1)
    ,match_on_first_occurance
	-- match on first occurances, this will lead to duplicates where reference value occurs more then once for a key value / first occurance combination
AS (SELECT
         bs.id
        ,bs.key_value
        ,bs.reference_date
        ,bs.reference_value
        ,bs.previous_reference_value
        ,bs.first_occurance_of_value
        ,fs.reference_date AS reference_value_modified_dt
    FROM base_selection                   bs
         LEFT OUTER JOIN first_occurances fs
             ON fs.key_value            = bs.key_value
                 AND fs.reference_value = bs.reference_value
                 AND fs.reference_date  <= bs.reference_date)
    ,deduplicate
-- deduplicate the set, find the reference date as close as possible to the reference date
AS (SELECT
         id
        ,key_value
        ,reference_date
        ,reference_value
        ,first_occurance_of_value
        ,match_on_first_occurance.reference_value_modified_dt
        ,ROW_NUMBER() OVER (PARTITION BY
                                key_value
                               ,reference_date
                            ORDER BY
                                match_on_first_occurance.reference_value_modified_dt DESC
                           ) AS row_nr
    FROM match_on_first_occurance)
SELECT *
FROM   deduplicate
WHERE  row_nr = 1;

SET STATISTICS TIME OFF;

test data generator

--DROP TABLE testdata;
--GO

CREATE TABLE testdata
(
    id              INT         IDENTITY NOT NULL
   ,key_value       VARCHAR(20) NOT NULL
   ,reference_date  DATE        NOT NULL
   ,reference_value CHAR(1)     NULL
   ,
   PRIMARY KEY CLUSTERED
   (
       key_value
      ,reference_date
   )
);

DECLARE @values AS TABLE
(
    reference_date  DATE         NOT NULL
   ,reference_value NVARCHAR(10) NULL
);

DECLARE
    @key         VARCHAR(20)
   ,@nr_of_sets  INT = 1000000 --will result in 10.000.000 rows
   ,@current_set INT = 0;


INSERT INTO @values
(
    reference_date
   ,reference_value
)

VALUES
    ('2000-01-01', NULL)
   ,('2001-01-01', 'X')
   ,('2002-01-01', 'X')
   ,('2002-02-01', 'Y')
   ,('2003-01-01', 'Y')
   ,('2004-01-01', 'X')
   ,('2005-01-01', NULL)
   ,('2006-01-01', 'X')
   ,('2007-01-01', 'X')
   ,('2010-01-01', 'X');


WHILE @current_set < @nr_of_sets
BEGIN
    SET @key = 'K' + CAST(@current_set AS VARCHAR(10));

    INSERT INTO testdata
    (
        key_value
       ,reference_date
       ,reference_value
    )
    SELECT
         @key
        ,reference_date
        ,reference_value
    FROM @values;

    SET @current_set = @current_set + 1;
END;
GO

SELECT COUNT(1)
FROM   testdata;
GO

My colleaguefound the following solution using a temptable and "quirky updates" . Quirky update. But i'm looking for a solution without temporary tables. (this runs within 50 seconds on my machine)

temp table with quirky update

-- DROP TABLE #testdata;
-- GO

CREATE TABLE #testdata
(
    id              INT         IDENTITY PRIMARY KEY CLUSTERED
   ,key_value       VARCHAR(20) NOT NULL
   ,reference_date  DATE        NULL
   ,reference_value CHAR(1)     NULL
   ,date_first      DATE        NULL
);

INSERT #testdata
(
    key_value
   ,reference_date
   ,reference_value
)
SELECT
     key_value
    ,reference_date
    ,reference_value
FROM testdata
ORDER BY
     1
    ,2;

DECLARE
    @previous_value NVARCHAR(10)
   ,@date_first     DATE;

UPDATE #testdata
SET
       @date_first=date_first = CASE
                                    WHEN reference_value = @previous_value THEN @date_first
                                    ELSE reference_date
                                END
      ,@previous_value = reference_value;


SELECT *
FROM   #testdata
WHERE  key_value = 'K0'
ORDER BY
       reference_date;

test data generator and my solution in t-sql

tiki1980 · February 1, 2018, 9:20am

Another colleague provided the solution below which is far more elegant then my solution

WITH inp
AS (SELECT
         id
        ,key_value
        ,reference_date
        ,reference_value
        ,CASE
             WHEN LAG(reference_value) OVER (PARTITION BY key_value ORDER BY reference_date) = reference_value THEN CAST(NULL AS DATE)
             ELSE reference_date
         END AS reference_date_change
    FROM testdata)
SELECT
     id
    ,key_value
    ,reference_date
    ,reference_value
    ,CASE
         WHEN reference_value IS NULL THEN NULL
         ELSE MAX(reference_date_change) OVER (PARTITION BY key_value
                                               ORDER BY
                                                   reference_date
                                               ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
                                              )
     END reference_date_change
FROM inp;

harishgg1 · December 19, 2018, 3:55pm

hi

i know this topic was from a long long time ago

i used recursive cte to do it
A different way to do it ...
Performance wise .. recursive cte's are very bad
I reached the limit of recusive cte 32767.
May be there is way to exceed the limit .. did not check

just a different way

SQL..

; WITH rec_cte 
     AS (SELECT *, 
                reference_date  AS grp 
         FROM   testdata 
         WHERE  id = 1 
         UNION ALL 
         SELECT a.*, 
                CASE 
                  WHEN Isnull(a.reference_value, 'PP') = 
                       Isnull(b.reference_value, 'PP') 
                       AND a.reference_date < b.grp THEN a.reference_date 
                  WHEN Isnull(a.reference_value, 'PP') <> 
                       Isnull(b.reference_value, 'PP') 
                THEN 
                  a.reference_date 
                  ELSE b.grp 
                END AS grp 
         FROM   testdata a 
                JOIN rec_cte b 
                  ON a.id = b.id + 1) 
SELECT * 
FROM   rec_cte 

go