Fragmentation Test of Clustered Index with large LOB Updates

In another thread

http://forums.sqlteam.com/t/timeoute-expired-the-timeout-period-elapsed-prior-to-completion/?source_topic_id=6271

there was discussion that a table with a Clustered Index on an ever-increasing IDENTITY with 100% Fill Factor was getting badly fragmented. @ScottPletcher suggested that it might be caused by an NVarchar column being extended significantly and causing page splits, and I wanted to have a play with that scenario to learn more about it, but I failed to reproduce the situation, so here's my test code in the hope someone can point out what I have done wrong - probably a School Boy Error !!

My code is based on a very useful article I read by Martin Smith:

http://stackoverflow.com/questions/1251636/what-do-clustered-and-non-clustered-index-actually-mean

DECLARE	@InRowData	varchar(3) = 'ON'	-- IN_ROW_DATA = ON or OFF
	, @NVarcharSize	int = 8001		-- 8001 or 20000 - Size to Update NVarchar column to
	, @Rows		int = 100		-- 100 or 1000 - Number of rows in the test
	, @Modulo	int = 3			-- 3 - Rows with (ID / @Modulo = 1 ) will be updated

CREATE TABLE T
  (
     X INT NOT NULL,
     Y CHAR(3000) NULL,
     Z NVARCHAR(MAX) NULL,
  );

CREATE CLUSTERED INDEX ix
  ON T(X);

EXEC sp_tableoption 'T', 'large value types out of row', @InRowData;


--Insert rows into [T] in random order (so that the table is fragmented)
DECLARE @C1 AS CURSOR,
        @X  AS INT

SET @C1 = CURSOR FAST_FORWARD
FOR SELECT number
    FROM   master..spt_values
    WHERE  type = 'P'
           AND number BETWEEN 1 AND @Rows
    ORDER  BY CRYPT_GEN_RANDOM(4)

OPEN @C1;

FETCH NEXT FROM @C1 INTO @X;

SET NOCOUNT ON;
WHILE @@FETCH_STATUS = 0
  BEGIN
      INSERT INTO T (X)
      VALUES        (@X);

      FETCH NEXT FROM @C1 INTO @X;
  END
SET NOCOUNT OFF;

-- Store the Page Layout in [T_Index1] and fragmentation in [T_Frag1] (to examine later)
SELECT page_id,
       X
--	, geometry::Point(page_id, X, 0).STBuffer(1) AS G
INTO	T_Index1
FROM   T
       CROSS APPLY sys.fn_PhysLocCracker( %% physloc %% )
ORDER  BY 
	X,
	page_id

SELECT	ps.index_id
	, [Partitions] = ps.partition_number
	, [Frag%] = ps.avg_fragmentation_in_percent
	, [Frags] = fragment_count
	, ps.page_count
	, index_depth
	, alloc_unit_type_desc
	, [Name] = OBJECT_NAME(ps.OBJECT_ID)
INTO	T_Frag1
FROM	sys.dm_db_index_physical_stats(DB_ID(), OBJECT_ID(N'T'), NULL , NULL, N'Limited') AS ps

-- Rebuild index
ALTER INDEX ix ON T REBUILD;

-- Store the Page Layout in [T_Index2] and fragmentation in [T_Frag2] - which should now be contiguous and not fragmented
SELECT page_id,
       X
--	, geometry::Point(page_id, X, 0).STBuffer(1) AS G
INTO	T_Index2
FROM   T
       CROSS APPLY sys.fn_PhysLocCracker( %% physloc %% )
ORDER  BY 
	X,
	page_id

SELECT	ps.index_id
	, [Partitions] = ps.partition_number
	, [Frag%] = ps.avg_fragmentation_in_percent
	, [Frags] = fragment_count
	, ps.page_count
	, index_depth
	, alloc_unit_type_desc
	, [Name] = OBJECT_NAME(ps.OBJECT_ID)
INTO	T_Frag2
FROM	sys.dm_db_index_physical_stats(DB_ID(), OBJECT_ID(N'T'), NULL , NULL, N'Limited') AS ps

-- Select IDs of rows to have their NVarchar column extended (based on @Modulo)
-- Save to #TEMP sorted into random order
SELECT	[ID] = IDENTITY(int, 1, 1),
	X
INTO	#TEMP
FROM	T
WHERE	X % @Modulo = 0
ORDER BY CRYPT_GEN_RANDOM(4)

DECLARE	@ID		int = 0
	, @RowsUpdated	int = 0

-- Extend NVarchar column (in random order in case that causes an issue that a single multi-row UPDATE would not)
SET NOCOUNT ON;
WHILE @ID <> -1
BEGIN
	-- Get next ID to process
	SELECT	TOP 1
		@ID = ID
		, @X = X
	FROM	#TEMP
	WHERE	ID > @ID
	IF @@ROWCOUNT = 0 SELECT @ID = -1	-- Terminate

	UPDATE	T
	SET	Z = REPLICATE('X', @NVarcharSize)
	FROM	T
	WHERE	X = @X
	SELECT	@RowsUpdated = @RowsUpdated + @@ROWCOUNT
END
SET NOCOUNT OFF;

DROP TABLE #TEMP
 
-- Store the Page Layout in [T_Index3] and fragmentation in [T_Frag3] to check for changes before / after rebuild
SELECT page_id,
       X
--	, geometry::Point(page_id, X, 0).STBuffer(1) AS G
INTO	T_Index3
FROM   T
       CROSS APPLY sys.fn_PhysLocCracker( %% physloc %% )
ORDER  BY 
	X,
	page_id

SELECT	ps.index_id
	, [Partitions] = ps.partition_number
	, [Frag%] = ps.avg_fragmentation_in_percent
	, [Frags] = fragment_count
	, ps.page_count
	, index_depth
	, alloc_unit_type_desc
	, [Name] = OBJECT_NAME(ps.OBJECT_ID)
INTO	T_Frag3
FROM	sys.dm_db_index_physical_stats(DB_ID(), OBJECT_ID(N'T'), NULL , NULL, N'Limited') AS ps

-- Rebuild index again
ALTER INDEX ix ON T REBUILD;


-- Store the Page Layout after rebuild in [T_Index4] and fragmentation in [T_Frag4]
SELECT page_id,
       X
--	, geometry::Point(page_id, X, 0).STBuffer(1) AS G
INTO	T_Index4
FROM   T
       CROSS APPLY sys.fn_PhysLocCracker( %% physloc %% )
ORDER  BY 
	X,
	page_id

SELECT	ps.index_id
	, [Partitions] = ps.partition_number
	, [Frag%] = ps.avg_fragmentation_in_percent
	, [Frags] = fragment_count
	, ps.page_count
	, index_depth
	, alloc_unit_type_desc
	, [Name] = OBJECT_NAME(ps.OBJECT_ID)
INTO	T_Frag4
FROM	sys.dm_db_index_physical_stats(DB_ID(), OBJECT_ID(N'T'), NULL , NULL, N'Limited') AS ps


----------

-- Results:
SET NOCOUNT ON
SELECT	[@InRowData]=@InRowData, [@NVarcharSize]=@NVarcharSize, [@Rows]=@Rows, [@Modulo]=@Modulo, [@RowsUpdated]=@RowsUpdated
PRINT 'Create randon:'
SELECT * FROM T_Frag1
PRINT 'Rebuild index(1):'
SELECT * FROM T_Frag2
PRINT 'Update NVarchar:'
SELECT * FROM T_Frag3
PRINT 'Rebuild index(2):'
SELECT * FROM T_Frag4
SET NOCOUNT OFF

/**
-- Review Row IDs and Pages - if desired
PRINT 'Create randon:'
SELECT * FROM T_Index1 ORDER BY page_id, X
PRINT 'Rebuild index(1):'
SELECT * FROM T_Index2 ORDER BY page_id, X
PRINT 'Update NVarchar:'
SELECT * FROM T_Index3 ORDER BY page_id, X
PRINT 'Rebuild index(2):'
SELECT * FROM T_Index4 ORDER BY page_id, X
--
**/


GO
DROP TABLE #TEMP
GO
DROP TABLE T
GO
DROP TABLE T_Frag1
GO
DROP TABLE T_Index1
GO
DROP TABLE T_Frag2
GO
DROP TABLE T_Index2
GO
DROP TABLE T_Frag3
GO
DROP TABLE T_Index3
GO
DROP TABLE T_Frag4
GO
DROP TABLE T_Index4
GO

You're forcing large values to be stored out of row (in overflow pages), therefore growth in a large value will never cause fragmentation of the main table. The use of "out of row" was specified to alleviate the problem of fragmentation.

Its a choice in the DECLARED variables at the top. I've tried it both ON and OFF and didn't make any appreciable difference, that I could see.

Don't make the string too long or SQL will put it in overflow anyway. Try INSERTing rows to get full or nearly full pages, then increase the length of the nvarchar(max) column by, say, 400 chars (800 bytes). That should force a page split.

Thanks Scott, I'll give that a try.

@Kristen, I wasn't able to follow your example 100%, and you said it didn't show what you wanted anyway. The following example is from my notes from a Paul Randal video that I watched a while ago (MCM videos, if I recall correctly). The code and screenshots are hopefully self explanatory.

-- TEST TABLE----------------------------------------------
IF OBJECT_ID('dbo.T') IS NOT NULL DROP TABLE dbo.T;
GO
CREATE TABLE dbo.T
  (
     X INT NOT NULL,
     Y VARCHAR(2000) NULL,
  );

-- INSERT SOME DATA----------------------------------------
;WITH N(n) AS ( 
	SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
	UNION ALL SELECT 1
) INSERT INTO dbo.T
        ( X, Y )
SELECT
	ROW_NUMBER() OVER (ORDER BY (SELECT NULL))*10,
	REPLICATE('A',1000)
FROM
	N a CROSS JOIN N b CROSS JOIN N c;
	
-- CREATE CLUSTERED INDEX ---------------------------------

CREATE CLUSTERED INDEX [ix] ON T (x ASC );
GO

Now see how the data is structured.

DBCC TRACEON (3604);
GO

-- Find Root page
DBCC SEMETADATA (N'T');
GO  -- MY HoBt Root: 1:156469

-- see what is in the root page.
DBCC PAGE (N'TestDatabase', 1, 156469, 3);
GO

What I see in the root page is that the leaf pages are all contiguous (Gaps in the first two are to be expected).

-- insert a row between 150 and 220, so the page will split
INSERT INTO T VALUES (151, REPLICATE ('A', 2000));
GO
-- now look at the pages again.
DBCC PAGE (N'TestDatabase', 1, 156469, 3);
GO

The child page Id 4203 in the above screenshot caused fragmentation as a result of an insert.

Now try expanding a column

-- expand 300, which should cause a new page between 156472 and 156473
UPDATE T SET Y = REPLICATE('A',2000) WHERE X = 300;

Thanks very much, the ChildPageID looks to be exactly what I need to see where the situation is occurring, coupled with Scott's suggestion to choose initial, and expand-to, size carefully to induce the problem.

If it is VARCHAR expansion that is causing the O/P's probably maybe the rows could be saved initially with some "slack" space, so that the alteration will fit back where it came from. We use that approach on a table which has an insert shortly followed by an update which increases the size of a VARCHAR. We now INSERT the row with slack space on the VARCHAR which the UPDATE then sets to the actual, desired, content - which is shorter in the majority of cases. Rebuild eventually reclaims the wasted space, but the UPDATEs don't cause every row to be moved (which was our performance problem, rather than a reporting one)