TSQL Challenge 13 - Solution by Jesse McLain



DECLARE @t TABLE (
	InvID INT IDENTITY,
	BatchNumber INT,
	InvoiceNumber INT, 
	VisitDate DATETIME,
	Amount MONEY )
INSERT INTO @t(BatchNumber, InvoiceNumber, VisitDate, Amount)
SELECT 10000001,20001,'2009-01-01',50.00 UNION ALL
SELECT 10000001,20001,'2009-01-02',50.00 UNION ALL
SELECT 10000001,20001,'2009-01-03',50.00 UNION ALL
SELECT 10000001,20001,'2009-01-04',50.00 UNION ALL
SELECT 10000001,20002,'2009-01-01',50.00 UNION ALL
SELECT 10000001,20002,'2009-01-02',50.00 UNION ALL
SELECT 10000001,20002,'2009-01-03',50.00 UNION ALL
SELECT 10000001,20003,'2009-01-01',50.00 UNION ALL
SELECT 10000001,20003,'2009-01-02',50.00 UNION ALL
SELECT 10000001,20004,'2009-01-01',50.00 UNION ALL
SELECT 10000001,20004,'2009-01-02',50.00 UNION ALL
SELECT 10000001,20004,'2009-01-03',50.00 UNION ALL
SELECT 10000001,20004,'2009-01-04',50.00 UNION ALL
SELECT 10000001,20004,'2009-01-05',50.00 UNION ALL
SELECT 10000001,20004,'2009-01-06',50.00 UNION ALL
SELECT 10000001,20005,'2009-01-01',50.00 UNION ALL
SELECT 10000001,20005,'2009-01-02',50.00 UNION ALL
SELECT 10000001,20005,'2009-01-13',50.00 UNION ALL
SELECT 10000001,20005,'2009-01-14',50.00 UNION ALL
SELECT 10000001,20005,'2009-01-15',50.00 UNION ALL
SELECT 10000001,20005,'2009-01-06',50.00 UNION ALL
SELECT 10000001,20005,'2009-01-07',50.00 UNION ALL
SELECT 10000001,20005,'2009-01-08',50.00 UNION ALL
SELECT 10000001,20006,'2009-01-01',50.00 UNION ALL
SELECT 10000002,20007,'2009-01-01',50.00 UNION ALL
SELECT 10000002,20007,'2009-01-02',50.00 UNION ALL
SELECT 10000002,20007,'2009-01-03',50.00 UNION ALL
SELECT 10000002,20008,'2009-01-01',50.00 UNION ALL
SELECT 10000002,20008,'2009-01-02',50.00 UNION ALL
SELECT 10000002,20008,'2009-01-03',50.00



--	ASSUME: that the COUNT for this query is <= 10:
--		SELECT InvoiceNumber, COUNT(*) FROM @t GROUP BY InvoiceNumber
	
--	ASSUME: that the InvoiceNumbers are sequential even as they cross BatchNumbers,
--		and there are no missing InvoiceNumbers within a given sequence (e.g., if
--		we have InvoiceNumbers 1 and 5 (arbitrary numbers), then it will always be the
--		case that InvoiceNumbers 2, 3, and 4 exist in the base data set. If this 
--		assumption is violated for any reason, then we'll have to use a surrogate
--		key in place of this column.
--	DO NOT ASSUME that the InvoiceNumbers are unique across BatchNumbers. We CAN 
--		assume it based on the sample data, but the solution does not make this
--		assumption, and data received in the future need not rely on this.



-- instead of processing at the row level of the source data set, we're going to process
--	our results at the batch/invoice grain. This will immediately address the requirement
--	that invoices should not be broken across batches.
; WITH CntByInv AS (
	SELECT 
		BatchNumber,
		InvoiceNumber,
		Cnt = COUNT(*)
	FROM @t
	GROUP BY BatchNumber, InvoiceNumber
)

-- this CTE is the workhorse of the whole script. it discovers every possible batch of 10 rows or
--	less of sequential InvoiceNumbers (and because the InvoiceNumbers are already grouped together
--	and counted in a bunch, there's no chance of returning a sequence where an invoice is broken
--	across multiple batches). This CTE is a double self-join. First it picks an arbitrary
--	InvoiceNumber, then finds an InvoiceNumber with the same BatchNumber that is greater than
--	the first, and then it self-joins to count the number of rows between those two. If the sum
--	does not exceed 10, then the sequence is returned for possible consideration later.
,AllPossibleSets AS (
	SELECT 
		BatchNumber = C1.BatchNumber,
		BgnInvNo = C1.InvoiceNumber,
		EndInvNo = C2.InvoiceNumber,
		Cnt = (SELECT SUM(Cnt) FROM CntByInv C3 WHERE C3.BatchNumber = C1.BatchNumber AND C3.InvoiceNumber BETWEEN C1.InvoiceNumber AND C2.InvoiceNumber)
	FROM CntByInv C1
	JOIN CntByInv C2
		ON C2.BatchNumber = C1.BatchNumber
		AND C2.InvoiceNumber >= C1.InvoiceNumber
	WHERE (SELECT SUM(Cnt) FROM CntByInv C3 WHERE C3.BatchNumber = C1.BatchNumber AND C3.InvoiceNumber BETWEEN C1.InvoiceNumber AND C2.InvoiceNumber) <= 10
)

-- this just determines the largest sequence per BatchNumber/Beginning InvoiceNumber
,BiggestSets AS (
	SELECT 
		BatchNumber, 
		BgnInvNo, 
		MaxCnt = MAX(Cnt)
	FROM AllPossibleSets
	GROUP BY BatchNumber, BgnInvNo
)

-- for each batch, the first set must contain the min InvoiceNumber. This root InvoiceNumber 
--	will become the anchor of a recursive CTE later.
,BatchRoots AS (
	SELECT
		BatchNumber,
		RootInvNo = MIN(InvoiceNumber)
	FROM @t
	GROUP BY BatchNumber
)

-- this CTE simply pulls the ideal set of batch/invoice rows from AllPossibleSets that start with
--	the root InvoiceNumber
,BatchRoots2 AS (
	SELECT
		AllPossibleSets.BatchNumber,
		AllPossibleSets.BgnInvNo,
		AllPossibleSets.EndInvNo,
		AllPossibleSets.Cnt
	FROM BatchRoots
	JOIN BiggestSets 
		ON BiggestSets.BatchNumber = BatchRoots.BatchNumber
		AND BiggestSets.BgnInvNo = BatchRoots.RootInvNo
	JOIN AllPossibleSets
		ON AllPossibleSets.BatchNumber = BatchRoots.BatchNumber
		AND AllPossibleSets.BgnInvNo = BatchRoots.RootInvNo
		AND AllPossibleSets.Cnt = BiggestSets.MaxCnt
)

-- this recursive CTE first pulls the root InvoiceNumber identified in BatchRoots and BatchRoots2,
--	then pulls the next biggest set of data by adding 1 to the invoice number (which exposes a 
--	critical assumption - that the InvoiceNumbers are sequential within batches.
,BestGrouping AS (
	SELECT
		BatchNumber,
		BgnInvNo,
		EndInvNo,
		Cnt
	FROM BatchRoots2
	UNION ALL SELECT 
		AllPossibleSets.BatchNumber,
		AllPossibleSets.BgnInvNo,
		AllPossibleSets.EndInvNo,
		AllPossibleSets.Cnt
	FROM BestGrouping
	JOIN AllPossibleSets
		ON AllPossibleSets.BatchNumber = BestGrouping.BatchNumber
		AND AllPossibleSets.BgnInvNo = BestGrouping.EndInvNo + 1
	JOIN BiggestSets
		ON BiggestSets.MaxCnt = AllPossibleSets.Cnt
		AND BiggestSets.BatchNumber = AllPossibleSets.BatchNumber
		AND BiggestSets.BgnInvNo = AllPossibleSets.BgnInvNo
)

-- the icing on the cake - after BestGrouping pulls together the final data set, this CTE
--	adds the SetNo values, which will be appended to the original data set. The SetNo values
--	are just a ranking by beginning InvoiceNumber across BatchNumbers.
,BestGrouping2 AS (
	SELECT
		BatchNumber,
		BgnInvNo,
		EndInvNo,
		Cnt,
		SetNo = RANK() OVER (PARTITION BY BatchNumber ORDER BY BgnInvNo)
	FROM BestGrouping
)

SELECT 
	A.InvID,
	A.BatchNumber,
	A.InvoiceNumber,
	A.VisitDate,
	A.Amount,
	B.SetNo
FROM @t A
LEFT JOIN BestGrouping2 B
	ON B.BatchNumber = A.BatchNumber
	AND A.InvoiceNumber BETWEEN B.BgnInvNo AND B.EndInvNo

-- NOTE: test for success by counting the number of SetNo values that are null. There shouldn't be
--	any; if there are, then check for data that violates our assumptions. If the data checks out ok,
--	then there is a problem with our solution.