Commit cc33460

clockfly authored and cloud-fan committed
[SPARK-17188][SQL] Moves class QuantileSummaries to project catalyst for implementing percentile_approx
## What changes were proposed in this pull request?

This is a sub-task of [SPARK-16283](https://issues.apache.org/jira/browse/SPARK-16283) (Implement percentile_approx SQL function), which moves class QuantileSummaries to project catalyst so that it can be reused when implementing the aggregation function `percentile_approx`.

## How was this patch tested?

This PR only does class relocation; the class implementation is not changed.

Author: Sean Zhong <[email protected]>

Closes apache#14754 from clockfly/move_QuantileSummaries_to_catalyst.
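For context, the relocation changes only where QuantileSummaries lives and is imported from; a minimal before/after sketch, based on the package and import lines visible in the diffs below:

// Before this commit: nested in StatFunctions, under sql/core
import org.apache.spark.sql.execution.stat.StatFunctions.QuantileSummaries

// After this commit: a standalone class under sql/catalyst
import org.apache.spark.sql.catalyst.util.QuantileSummaries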
1 parent d2b3d3e commit cc33460

File tree

3 files changed: +267 −251 lines changed
Lines changed: 264 additions & 0 deletions
@@ -0,0 +1,264 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.catalyst.util

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.sql.catalyst.util.QuantileSummaries.Stats

/**
 * Helper class to compute approximate quantile summary.
 * This implementation is based on the algorithm proposed in the paper:
 * "Space-efficient Online Computation of Quantile Summaries" by Greenwald, Michael
 * and Khanna, Sanjeev. (http://dx.doi.org/10.1145/375663.375670)
 *
 * In order to optimize for speed, it maintains an internal buffer of the last seen samples,
 * and only inserts them after crossing a certain size threshold. This guarantees a near-constant
 * runtime complexity compared to the original algorithm.
 *
 * @param compressThreshold the compression threshold.
 *   After the internal buffer of statistics crosses this size, it attempts to compress the
 *   statistics together.
 * @param relativeError the target relative error.
 *   It is uniform across the complete range of values.
 * @param sampled a buffer of quantile statistics.
 *   See the G-K article for more details.
 * @param count the count of all the elements *inserted in the sampled buffer*
 *   (excluding the head buffer)
 */
class QuantileSummaries(
    val compressThreshold: Int,
    val relativeError: Double,
    val sampled: Array[Stats] = Array.empty,
    val count: Long = 0L) extends Serializable {

  // a buffer of the latest samples seen so far
  private val headSampled: ArrayBuffer[Double] = ArrayBuffer.empty

  import QuantileSummaries._

  /**
   * Returns a summary with the given observation inserted into the summary.
   * This method may either modify in place the current summary (and return the same summary,
   * modified in place), or it may create a new summary from scratch if necessary.
   * @param x the new observation to insert into the summary
   */
  def insert(x: Double): QuantileSummaries = {
    headSampled.append(x)
    if (headSampled.size >= defaultHeadSize) {
      this.withHeadBufferInserted
    } else {
      this
    }
  }

  /**
   * Inserts an array of (unsorted) samples in a batch, sorting the array first to traverse
   * the summary statistics in a single pass.
   *
   * This method does not modify the current object and returns, if necessary, a new copy.
   *
   * @return a new quantile summary object.
   */
  private def withHeadBufferInserted: QuantileSummaries = {
    if (headSampled.isEmpty) {
      return this
    }
    var currentCount = count
    val sorted = headSampled.toArray.sorted
    val newSamples: ArrayBuffer[Stats] = new ArrayBuffer[Stats]()
    // The index of the next existing sample to copy over from `sampled`.
    var sampleIdx = 0
    // The index of the new observation currently being inserted.
    var opsIdx: Int = 0
    while (opsIdx < sorted.length) {
      val currentSample = sorted(opsIdx)
      // Add all the existing samples that come before the next observation.
      while (sampleIdx < sampled.size && sampled(sampleIdx).value <= currentSample) {
        newSamples.append(sampled(sampleIdx))
        sampleIdx += 1
      }

      // If it is the first one to insert, or if it is the last one, delta is 0.
      currentCount += 1
      val delta =
        if (newSamples.isEmpty || (sampleIdx == sampled.size && opsIdx == sorted.length - 1)) {
          0
        } else {
          math.floor(2 * relativeError * currentCount).toInt
        }

      val tuple = Stats(currentSample, 1, delta)
      newSamples.append(tuple)
      opsIdx += 1
    }

    // Add all the remaining existing samples
    while (sampleIdx < sampled.size) {
      newSamples.append(sampled(sampleIdx))
      sampleIdx += 1
    }
    new QuantileSummaries(compressThreshold, relativeError, newSamples.toArray, currentCount)
  }

  /**
   * Returns a new summary that compresses the summary statistics and the head buffer.
   *
   * This implements the COMPRESS function of the GK algorithm. It does not modify the object.
   *
   * @return a new summary object with compressed statistics
   */
  def compress(): QuantileSummaries = {
    // Inserts all the elements first
    val inserted = this.withHeadBufferInserted
    assert(inserted.headSampled.isEmpty)
    assert(inserted.count == count + headSampled.size)
    val compressed =
      compressImmut(inserted.sampled, mergeThreshold = 2 * relativeError * inserted.count)
    new QuantileSummaries(compressThreshold, relativeError, compressed, inserted.count)
  }

  private def shallowCopy: QuantileSummaries = {
    new QuantileSummaries(compressThreshold, relativeError, sampled, count)
  }

  /**
   * Merges two (compressed) summaries together.
   *
   * Returns a new summary.
   */
  def merge(other: QuantileSummaries): QuantileSummaries = {
    require(headSampled.isEmpty, "Current buffer needs to be compressed before merge")
    require(other.headSampled.isEmpty, "Other buffer needs to be compressed before merge")
    if (other.count == 0) {
      this.shallowCopy
    } else if (count == 0) {
      other.shallowCopy
    } else {
      // Merge the two buffers.
      // The GK algorithm is a bit unclear about it, but it seems there is no need to adjust the
      // statistics during the merging: the invariants are still respected after the merge.
      // TODO: could replace full sort by ordered merge, the two lists are known to be sorted
      // already.
      val res = (sampled ++ other.sampled).sortBy(_.value)
      val comp = compressImmut(res, mergeThreshold = 2 * relativeError * count)
      new QuantileSummaries(
        other.compressThreshold, other.relativeError, comp, other.count + count)
    }
  }

  /**
   * Runs a query for a given quantile.
   * The result follows the approximation guarantees detailed above.
   * The query can only be run on a compressed summary: you need to call compress() before using
   * it.
   *
   * @param quantile the target quantile
   * @return the approximate value at the target quantile
   */
  def query(quantile: Double): Double = {
    require(quantile >= 0 && quantile <= 1.0, "quantile should be in the range [0.0, 1.0]")
    require(headSampled.isEmpty,
      "Cannot operate on an uncompressed summary, call compress() first")

    if (quantile <= relativeError) {
      return sampled.head.value
    }

    if (quantile >= 1 - relativeError) {
      return sampled.last.value
    }

    // Target rank
    val rank = math.ceil(quantile * count).toInt
    val targetError = math.ceil(relativeError * count)
    // Minimum rank at current sample
    var minRank = 0
    var i = 1
    while (i < sampled.size - 1) {
      val curSample = sampled(i)
      minRank += curSample.g
      val maxRank = minRank + curSample.delta
      if (maxRank - targetError <= rank && rank <= minRank + targetError) {
        return curSample.value
      }
      i += 1
    }
    sampled.last.value
  }
}

object QuantileSummaries {
  // TODO(tjhunter) more tuning could be done on the constants here, but for now
  // the main cost of the algorithm is accessing the data in SQL.
  /**
   * The default value for the compression threshold.
   */
  val defaultCompressThreshold: Int = 10000

  /**
   * The size of the head buffer.
   */
  val defaultHeadSize: Int = 50000

  /**
   * The default value for the relative error (1%).
   * With this value, the best extreme percentiles that can be approximated are 1% and 99%.
   */
  val defaultRelativeError: Double = 0.01

  /**
   * Statistics from the Greenwald-Khanna paper.
   * @param value the sampled value
   * @param g the minimum rank jump from the previous value's minimum rank
   * @param delta the maximum span of the rank.
   */
  case class Stats(value: Double, g: Int, delta: Int)

  private def compressImmut(
      currentSamples: IndexedSeq[Stats],
      mergeThreshold: Double): Array[Stats] = {
    if (currentSamples.isEmpty) {
      return Array.empty[Stats]
    }
    val res: ArrayBuffer[Stats] = ArrayBuffer.empty
    // Start from the last element, which is always part of the set.
    // The head contains the current new head, that may be merged with the current element.
    var head = currentSamples.last
    var i = currentSamples.size - 2
    // Do not compress the last element
    while (i >= 1) {
      // The current sample:
      val sample1 = currentSamples(i)
      // Do we need to compress?
      if (sample1.g + head.g + head.delta < mergeThreshold) {
        // Do not insert yet, just merge the current element into the head.
        head = head.copy(g = head.g + sample1.g)
      } else {
        // Prepend the current head, and keep the current sample as target for merging.
        res.prepend(head)
        head = sample1
      }
      i -= 1
    }
    res.prepend(head)
    // If necessary, add the minimum element:
    res.prepend(currentSamples.head)
    res.toArray
  }
}
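A minimal usage sketch of the class above, not part of this commit: it assumes the class is available under its new catalyst package, and the helper name, input data, and printed quantiles are illustrative only.

import org.apache.spark.sql.catalyst.util.QuantileSummaries

object QuantileSummariesExample {
  // Build a compressed summary from a sequence of observations. insert() may mutate and
  // return the same summary or return a new one, so always keep the returned value.
  private def summaryOf(values: Seq[Double]): QuantileSummaries = {
    val empty = new QuantileSummaries(
      QuantileSummaries.defaultCompressThreshold,
      QuantileSummaries.defaultRelativeError)
    values.foldLeft(empty)((summary, x) => summary.insert(x)).compress()
  }

  def main(args: Array[String]): Unit = {
    // query() requires a compressed summary; summaryOf already called compress().
    val s1 = summaryOf((1 to 1000).map(_.toDouble))
    println(s1.query(0.5))      // roughly 500, within the default 1% relative error

    // Two compressed summaries can be combined with merge().
    val s2 = summaryOf((1001 to 2000).map(_.toDouble))
    val merged = s1.merge(s2)
    println(merged.query(0.5))  // roughly 1000 over the combined data
  }
}

The insert/compress/query lifecycle mirrors the requirements stated in the doc comments: both query() and merge() require a compressed summary.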

sql/core/src/test/scala/org/apache/spark/sql/execution/stat/ApproxQuantileSuite.scala renamed to sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/QuantileSummariesSuite.scala

Lines changed: 2 additions & 5 deletions
@@ -15,15 +15,13 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.execution.stat
+package org.apache.spark.sql.catalyst.util
 
 import scala.util.Random
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.execution.stat.StatFunctions.QuantileSummaries
 
-
-class ApproxQuantileSuite extends SparkFunSuite {
+class QuantileSummariesSuite extends SparkFunSuite {
 
   private val r = new Random(1)
   private val n = 100
@@ -125,5 +123,4 @@ class ApproxQuantileSuite extends SparkFunSuite {
       checkQuantile(0.001, data, s)
     }
   }
-
 }
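A hypothetical fragment in the style of the renamed suite, not part of this commit: it exercises the extreme-quantile short-circuit of query() using only the public API shown in the file above; the test name, data, and tolerance are assumptions.

  test("extreme quantiles return the smallest and largest sampled values") {
    val empty = new QuantileSummaries(
      QuantileSummaries.defaultCompressThreshold,
      QuantileSummaries.defaultRelativeError)
    val s = (1 to 100).map(_.toDouble).foldLeft(empty)((acc, x) => acc.insert(x)).compress()
    // quantile <= relativeError short-circuits to the first sample,
    // quantile >= 1 - relativeError to the last one.
    assert(s.query(0.0) === 1.0)
    assert(s.query(1.0) === 100.0)
  }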

0 commit comments
