我有一个这样的样本表:
-- Sample data: one row per occurrence of a name within a category,
-- held in a session-local temp table.
CREATE TABLE #TEMP
(
    Category VARCHAR(100),
    Name     VARCHAR(100)
);

-- 25 rows in total: A -> John x6, Adam x4, Lisa x2, Bucky x1;
--                   B -> Lily x5, Tom x4, Ross x3.
INSERT INTO #TEMP (Category, Name)
VALUES
    ('A', 'John'), ('A', 'John'), ('A', 'John'),
    ('A', 'John'), ('A', 'John'), ('A', 'John'),
    ('A', 'Adam'), ('A', 'Adam'), ('A', 'Adam'), ('A', 'Adam'),
    ('A', 'Lisa'), ('A', 'Lisa'),
    ('A', 'Bucky'),
    ('B', 'Lily'), ('B', 'Lily'), ('B', 'Lily'), ('B', 'Lily'), ('B', 'Lily'),
    ('B', 'Tom'),  ('B', 'Tom'),  ('B', 'Tom'),  ('B', 'Tom'),
    ('B', 'Ross'), ('B', 'Ross'), ('B', 'Ross');

-- Occurrence count of every name inside its category, highest count first
-- within each category.
SELECT
    Category,
    Name,
    COUNT(Name) AS Total
FROM #TEMP
GROUP BY
    Category,
    Name
ORDER BY
    Category ASC,
    Total DESC;

-- Clean up the temp table once the result has been produced.
DROP TABLE #TEMP;
给我以下内容:
A John 6
A Adam 4
A Lisa 2
A Bucky 1
B Lily 5
B Tom 4
B Ross 3
现在,我该如何选出每个类别中排名前 5%(TOP 5 PERCENT)的记录?假设每个类别都有超过 100 条记录(此处的样本表未全部列出)。例如,在我的实际表中,如果符合条件,应当把 John 从 A 中去掉、把 Lily 从 B 中去掉(再次说明,这里没有展示完整的表),从而得到:
A Adam 4
A Lisa 2
A Bucky 1
B Tom 4
B Ross 3
我一直在尝试使用 CTE 和 PARTITION BY 子句,但似乎无法得到想要的结果。它是从整体结果中去掉了 TOP 5 PERCENT,而不是从每个类别中分别去掉。有什么建议吗?
您可以使用 CTE(公用表表达式)配合 NTILE 窗口函数——它会把数据切成您需要的若干个切片,例如在您的情况下切成 20 片(每片 5%)。
-- Removes the top slice (tile 1 of 20, i.e. roughly the top 5%) of names,
-- ranked by occurrence count, separately for every category.
;WITH SlicedData AS
(
    -- One row per (Category, Name) with its occurrence count, bucketed into
    -- 20 tiles per category; tile 1 holds the highest counts.
    SELECT
        Category,
        Name,
        COUNT(Name) AS Total,
        -- Name is added as a tie-breaker so that names with equal counts are
        -- always assigned to the same tile from run to run.
        -- NOTE(review): a category with fewer than 20 names cannot be split
        -- into true 5% slices; confirm the data volume before relying on this.
        NTILE(20) OVER (
            PARTITION BY Category
            ORDER BY COUNT(Name) DESC, Name
        ) AS [NTile]
    FROM #TEMP
    GROUP BY
        Category,
        Name
)
SELECT
    Category,
    Name,
    Total,
    [NTile]
FROM SlicedData
WHERE [NTile] > 1;  -- skip tile 1, the highest-count slice of each category
这基本上是先按 Category、Name 对数据分组,再按某个指标排序(不确定 COUNT(Name) 是否真的是您想要的排序依据),然后把每个分区切成 20 块,每块代表该分区 5% 的数据。NTile = 1 的切片就是最高的 5%——从 CTE 中查询时把它排除即可。
参见:NTILE 的官方文档,了解更多信息。
编辑:我添加了第二个解决方案
-- Row-level variant: returns every source row whose name passes the count
-- filter, together with how often that name occurs in its category.
-- NOTE(review): assumes #TEMP has an Id column, which the CREATE TABLE in the
-- question does not show — confirm against the real table.
;WITH counted AS
(
    SELECT
        t.Id,
        t.Category,
        t.Name,
        -- occurrences of this name inside its category
        COUNT(*) OVER (PARTITION BY t.Category, t.Name) AS CategoryNameCount,
        -- total number of rows in the category
        COUNT(*) OVER (PARTITION BY t.Category)         AS CategoryCount
    FROM #TEMP AS t
)
SELECT
    c.Id,
    c.Category,
    c.Name,
    c.CategoryNameCount
FROM counted AS c
-- NOTE(review): CategoryCount appears on both sides, so this comparison is
-- algebraically equivalent to CategoryNameCount < 5 — a fixed count
-- threshold rather than a percentage of the category. Confirm that is intended.
WHERE c.CategoryCount * 5.0 / 100 > c.CategoryCount * c.CategoryNameCount * 1.0 / 100
ORDER BY
    c.Category,
    c.CategoryNameCount DESC,
    c.Name
结果:
Id Category Name CategoryNameCount
----------- -------- ---------- -----------------
7 A Adam 4
8 A Adam 4
9 A Adam 4
10 A Adam 4
11 A Lisa 2
12 A Lisa 2
13 A Bucky 1
19 B Tom 4
20 B Tom 4
21 B Tom 4
22 B Tom 4
23 B Ross 3
24 B Ross 3
25 B Ross 3
或者:
-- Grouped variant: one output row per (Category, Name) that passes the count
-- filter, with the number of times the name occurs in its category.
;WITH counted AS
(
    SELECT
        t.Category,
        t.Name,
        -- occurrences of this name inside its category
        COUNT(*) OVER (PARTITION BY t.Category, t.Name) AS CategoryNameCount,
        -- total number of rows in the category
        COUNT(*) OVER (PARTITION BY t.Category)         AS CategoryCount
    FROM #TEMP AS t
)
SELECT
    c.Category,
    c.Name,
    c.CategoryNameCount
FROM counted AS c
-- NOTE(review): CategoryCount appears on both sides, so this comparison is
-- algebraically equivalent to CategoryNameCount < 5 — a fixed count
-- threshold rather than a percentage of the category. Confirm that is intended.
WHERE c.CategoryCount * 5.0 / 100 > c.CategoryCount * c.CategoryNameCount * 1.0 / 100
-- Grouping by every selected column collapses the per-row duplicates.
GROUP BY
    c.Category,
    c.Name,
    c.CategoryNameCount
ORDER BY
    c.Category,
    c.CategoryNameCount DESC,
    c.Name
结果:
Category Name CategoryNameCount
-------- ---------- -----------------
A Adam 4
A Lisa 2
A Bucky 1
B Tom 4
B Ross 3
-- Keeps the first 50% (integer division, so rounded down) of the rows of
-- every (Category, name) group.
;WITH numbered AS
(
    SELECT
        Category,
        name,
        -- size of the (Category, name) group this row belongs to
        COUNT(*) OVER (PARTITION BY Category, name) AS CountTotal,
        -- running position of the row inside its group; rows within a group
        -- are identical, so the ordering column only satisfies the syntax
        ROW_NUMBER() OVER (PARTITION BY Category, name ORDER BY Category) AS RankSeq
    FROM #TEMP
)
SELECT
    Category,
    name,
    CountTotal,
    RankSeq,
    (50 * CountTotal) / 100  -- row quota per group; left unaliased as in the original output
FROM numbered
WHERE RankSeq <= ((50 * CountTotal) / 100)
ORDER BY
    Category,
    name,
    RankSeq
输出:
Category name CountTotal RankSeq (50*CountTotal)/100
A Adam 4 1 2
A Adam 4 2 2
A John 6 1 3
A John 6 2 3
A John 6 3 3
A Lisa 2 1 1
B Lily 5 1 2
B Lily 5 2 2
B Ross 3 1 1
B Tom 4 1 2
B Tom 4 2 2
我希望这有帮助 :)
-- Selects, per category, the names whose percent rank (by occurrence count,
-- highest first) falls below 5.
;WITH SlicedData AS
(
    -- One row per (Category, Name) with its occurrence count and its
    -- relative rank within the category, scaled to 0-100.
    SELECT
        Category,
        Name,
        COUNT(Name) AS Total,
        PERCENT_RANK() OVER (
            PARTITION BY Category
            ORDER BY COUNT(Name) DESC
        ) * 100 AS [Percent]  -- PERCENT is a reserved keyword, hence the brackets
    FROM #TEMP
    GROUP BY
        Category,
        Name
)
SELECT
    Category,
    Name,
    Total,
    [Percent]
FROM SlicedData
-- The highest-count name of a category has percent rank 0, so this keeps the
-- top of each category. NOTE(review): use [Percent] >= 5 instead if the goal
-- is to drop the top 5% rather than select it.
WHERE [Percent] < 5;
如果记录数少于您指定的分片数(tile 数),NTILE 将无法按预期工作。