|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one or more |
| 3 | + * contributor license agreements. See the NOTICE file distributed with |
| 4 | + * this work for additional information regarding copyright ownership. |
| 5 | + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| 6 | + * (the "License"); you may not use this file except in compliance with |
| 7 | + * the License. You may obtain a copy of the License at |
| 8 | + * |
| 9 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | + * |
| 11 | + * Unless required by applicable law or agreed to in writing, software |
| 12 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | + * See the License for the specific language governing permissions and |
| 15 | + * limitations under the License. |
| 16 | + */ |
| 17 | + |
| 18 | +package org.apache.spark.sql.catalyst.util |
| 19 | + |
| 20 | +import scala.collection.mutable.ArrayBuffer |
| 21 | + |
| 22 | +import org.apache.spark.sql.catalyst.util.QuantileSummaries.Stats |
| 23 | + |
/**
 * Helper class to compute approximate quantile summary.
 * This implementation is based on the algorithm proposed in the paper:
 * "Space-efficient Online Computation of Quantile Summaries" by Greenwald, Michael
 * and Khanna, Sanjeev. (http://dx.doi.org/10.1145/375663.375670)
 *
 * In order to optimize for speed, it maintains an internal buffer of the last seen samples
 * (the "head buffer"), and only inserts them into the summary statistics after the buffer
 * reaches [[QuantileSummaries.defaultHeadSize]] elements, so that the cost of sorting and
 * merging is paid once per batch rather than once per observation.
 *
 * @param compressThreshold the compression threshold.
 *   After the buffer of statistics reaches this size following a head-buffer flush, the
 *   statistics are compressed.
 * @param relativeError the target relative error.
 *   It is uniform across the complete range of values.
 * @param sampled a buffer of quantile statistics, ordered by value.
 *   See the G-K article for more details.
 * @param count the count of all the elements *inserted in the sampled buffer*
 *   (excluding the head buffer)
 */
class QuantileSummaries(
    val compressThreshold: Int,
    val relativeError: Double,
    val sampled: Array[QuantileSummaries.Stats] = Array.empty,
    val count: Long = 0L) extends Serializable {

  import QuantileSummaries._

  // Buffer of the latest observations that have not been folded into `sampled` yet.
  private val headSampled: ArrayBuffer[Double] = ArrayBuffer.empty

  /**
   * Returns a summary with the given observation inserted into the summary.
   * This method may either modify in place the current summary (and return the same summary,
   * modified in place), or it may create a new summary from scratch if necessary. Callers must
   * therefore always continue with the returned summary.
   *
   * When the head buffer reaches [[QuantileSummaries.defaultHeadSize]] elements it is flushed
   * into the statistics; if the statistics then hold at least `compressThreshold` entries they
   * are compressed as well.
   *
   * @param x the new observation to insert into the summary
   * @return the summary to use from now on
   */
  def insert(x: Double): QuantileSummaries = {
    headSampled += x
    if (headSampled.size >= defaultHeadSize) {
      val flushed = withHeadBufferInserted
      // Without this step `compressThreshold` is never honoured and the statistics grow by
      // one entry per observation.
      if (flushed.sampled.length >= compressThreshold) flushed.compress() else flushed
    } else {
      this
    }
  }

  /**
   * Folds the (unsorted) head buffer into the sorted statistics in a single pass: the buffer
   * is sorted first and then merged with `sampled`.
   *
   * This method does not modify the current object. It returns `this` when the head buffer is
   * empty, and a new summary with an empty head buffer otherwise.
   *
   * @return a summary whose statistics contain every observation seen so far
   */
  private def withHeadBufferInserted: QuantileSummaries = {
    if (headSampled.isEmpty) {
      this
    } else {
      var currentCount = count
      val sorted = headSampled.toArray.sorted
      val newSamples: ArrayBuffer[Stats] = new ArrayBuffer[Stats]()
      // The index of the next existing statistic to copy over.
      var sampleIdx = 0
      // The index of the observation currently being inserted.
      var opsIdx = 0
      while (opsIdx < sorted.length) {
        val currentSample = sorted(opsIdx)
        // Copy all the existing statistics that come before (or tie with) this observation.
        while (sampleIdx < sampled.length && sampled(sampleIdx).value <= currentSample) {
          newSamples += sampled(sampleIdx)
          sampleIdx += 1
        }

        currentCount += 1
        // The new overall minimum and the new overall maximum have an exactly known rank,
        // hence delta = 0; any other observation gets the current G-K uncertainty band.
        val isFirst = newSamples.isEmpty
        val isLast = sampleIdx == sampled.length && opsIdx == sorted.length - 1
        val delta =
          if (isFirst || isLast) 0 else math.floor(2 * relativeError * currentCount).toInt

        newSamples += Stats(currentSample, 1, delta)
        opsIdx += 1
      }

      // Copy all the remaining existing statistics.
      while (sampleIdx < sampled.length) {
        newSamples += sampled(sampleIdx)
        sampleIdx += 1
      }
      new QuantileSummaries(compressThreshold, relativeError, newSamples.toArray, currentCount)
    }
  }

  /**
   * Returns a new summary that compresses the summary statistics and the head buffer.
   *
   * This implements the COMPRESS function of the GK algorithm. It does not modify the object.
   *
   * @return a new summary object with compressed statistics and an empty head buffer
   */
  def compress(): QuantileSummaries = {
    // Fold in all the buffered observations first.
    val inserted = withHeadBufferInserted
    assert(inserted.headSampled.isEmpty)
    assert(inserted.count == count + headSampled.size)
    val compressed =
      compressImmut(inserted.sampled, mergeThreshold = 2 * relativeError * inserted.count)
    new QuantileSummaries(compressThreshold, relativeError, compressed, inserted.count)
  }

  /** Returns a new summary sharing this summary's statistics, with an empty head buffer. */
  private def shallowCopy: QuantileSummaries = {
    new QuantileSummaries(compressThreshold, relativeError, sampled, count)
  }

  /**
   * Merges two (compressed) summaries together.
   *
   * Neither summary may hold buffered observations: call `compress()` on both first.
   *
   * @param other the summary to merge with this one
   * @return a new summary covering the observations of both inputs
   */
  def merge(other: QuantileSummaries): QuantileSummaries = {
    require(headSampled.isEmpty, "Current buffer needs to be compressed before merge")
    require(other.headSampled.isEmpty, "Other buffer needs to be compressed before merge")
    if (other.count == 0) {
      this.shallowCopy
    } else if (count == 0) {
      other.shallowCopy
    } else {
      // Merge the two buffers.
      // The GK algorithm is a bit unclear about it, but it seems there is no need to adjust the
      // statistics during the merging: the invariants are still respected after the merge.
      // TODO: could replace full sort by ordered merge, the two lists are known to be sorted
      // already.
      // NOTE(review): the merge threshold is derived from this summary's count and relative
      // error only, while the result carries `other`'s parameters, so a.merge(b) and b.merge(a)
      // may differ. Kept as is to preserve existing results; confirm whether that is intended.
      val res = (sampled ++ other.sampled).sortBy(_.value)
      val comp = compressImmut(res, mergeThreshold = 2 * relativeError * count)
      new QuantileSummaries(
        other.compressThreshold, other.relativeError, comp, other.count + count)
    }
  }

  /**
   * Runs a query for a given quantile.
   * The query can only be run on a compressed summary: you need to call compress() before using
   * it.
   *
   * @param quantile the target quantile, in the range [0.0, 1.0]
   * @return the value of the sample selected for that quantile
   * @throws IllegalArgumentException if the quantile is out of range or observations are still
   *   buffered
   * @throws NoSuchElementException if the summary holds no sample
   */
  def query(quantile: Double): Double = {
    require(quantile >= 0 && quantile <= 1.0, "quantile should be in the range [0.0, 1.0]")
    require(headSampled.isEmpty,
      "Cannot operate on an uncompressed summary, call compress() first")
    if (sampled.isEmpty) {
      // Same exception type as calling head/last on an empty array, with an explicit message.
      throw new NoSuchElementException("Cannot query an empty summary")
    }

    if (quantile <= relativeError) {
      sampled.head.value
    } else if (quantile >= 1 - relativeError) {
      sampled.last.value
    } else {
      // Target rank, kept as a Long because `count` is a Long.
      val rank = math.ceil(quantile * count).toLong
      val targetError = math.ceil(relativeError * count)
      // Minimum rank at the current sample. The scan starts at the first sample so that its
      // `g` is part of the running total; skipping it shifts every rank by one.
      var minRank = 0L
      var i = 0
      val lastIdx = sampled.length - 1
      // The last sample is the fallback answer when no earlier sample matches.
      var result = sampled.last.value
      var found = false
      while (!found && i < lastIdx) {
        val curSample = sampled(i)
        minRank += curSample.g
        val maxRank = minRank + curSample.delta
        if (maxRank - targetError <= rank && rank <= minRank + targetError) {
          result = curSample.value
          found = true
        }
        i += 1
      }
      result
    }
  }
}

object QuantileSummaries {
  // TODO(tjhunter) more tuning could be done on the constants here, but for now
  // the main cost of the algorithm is accessing the data in SQL.
  /**
   * The default value for the compression threshold.
   */
  val defaultCompressThreshold: Int = 10000

  /**
   * The size of the head buffer.
   */
  val defaultHeadSize: Int = 50000

  /**
   * The default value for the relative error (1%).
   * With this value, the best extreme percentiles that can be approximated are 1% and 99%.
   */
  val defaultRelativeError: Double = 0.01

  /**
   * Statistics from the Greenwald-Khanna paper.
   * @param value the sampled value
   * @param g the minimum rank jump from the previous value's minimum rank
   * @param delta the maximum span of the rank.
   */
  case class Stats(value: Double, g: Int, delta: Int)

  /**
   * Compresses a sorted sequence of statistics without modifying it.
   *
   * The sequence is walked from the largest sample down to the second one. A sample is merged
   * into its successor (adding its `g` to the successor's) while the combined
   * `g + g' + delta'` stays strictly below `mergeThreshold`. The smallest and the largest
   * samples are always kept.
   *
   * @param currentSamples the statistics to compress, ordered by value
   * @param mergeThreshold the exclusive upper bound on `g + g' + delta'` for a merge
   * @return a new array of compressed statistics, ordered by value
   */
  private def compressImmut(
      currentSamples: Array[Stats],
      mergeThreshold: Double): Array[Stats] = {
    if (currentSamples.isEmpty) {
      Array.empty[Stats]
    } else {
      // Built back to front with O(1) prepends on an immutable list.
      var kept: List[Stats] = Nil
      // Start from the last element, which is always part of the result.
      // `head` is the current front of the result, which may still absorb earlier samples.
      var head = currentSamples.last
      var i = currentSamples.length - 2
      // Index 0 (the minimum) is never merged; it is added back below.
      while (i >= 1) {
        val sample1 = currentSamples(i)
        if (sample1.g + head.g + head.delta < mergeThreshold) {
          // Do not emit yet, just merge the current sample into the head.
          head = head.copy(g = head.g + sample1.g)
        } else {
          // Emit the current head, and keep the current sample as the next merge target.
          kept = head :: kept
          head = sample1
        }
        i -= 1
      }
      kept = head :: kept
      // Add the minimum element, unless it is the very same element as `head`
      // (single-sample input), which would otherwise be emitted twice.
      if (currentSamples.length > 1) {
        kept = currentSamples.head :: kept
      }
      kept.toArray
    }
  }
}
0 commit comments