summaryrefslogtreecommitdiff
path: root/core/java/com/android/internal/util/HeavyHitterSketch.java
blob: e18acaf3679774aa550da496283c0d27a86c5b47 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
/*
 * Copyright (C) 2020 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.internal.util;

import android.annotation.NonNull;
import android.annotation.Nullable;
import android.util.SparseArray;
import android.util.SparseIntArray;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * <p>A utility which processes a sequence of input (stream) and output the heavy hitters
 * (the frequent ones).</p>
 *
 * @param <T> The type of the input.
 * @see <a href="https://en.wikipedia.org/wiki/Streaming_algorithm">Stream Algorithm</a> for
 * the definion of heavy hitters and the list of algorithms for detecting it.
 * <p>
 * {@hide}
 */
public interface HeavyHitterSketch<T> {
    /**
     * Return the default implementation.
     *
     * @return The default implementation.
     */
    static <V> @NonNull HeavyHitterSketch<V> newDefault() {
        return new HeavyHitterSketchImpl<V>();
    }

    /**
     * Set the configuration with given parameters
     *
     * @param inputSize The amount of the input.
     * @param capacity  The maximum number of distinct input it should track; it defines the lower
     *                  bound of the output.
     */
    void setConfig(int inputSize, int capacity);

    /**
     * Add a new input to the current sketch.
     *
     * @param newInstance The new input
     */
    void add(@Nullable T newInstance);

    /**
     * @param k      The number of heavy hitters it should return, k &lt; capacity, a value of 0
     *               will be equivalent to capacity - 1
     * @param holder The list array into which the elements of the tops are to be stored; a new list
     *               would be created and returned if this parameter is null.
     * @param freqs  Optional, the frequencies of each items in the returned result
     * @return The top K heavy hitters(frequent input)
     */
    @Nullable
    List<T> getTopHeavyHitters(int k, @Nullable List<T> holder, @Nullable List<Float> freqs);

    /**
     * @param holder The list array into which the elements of the candidates are to be stored; a
     *               new list would be created and returned if this parameter is null.
     * @return The candidate heavy hitters so far, it could include false postives.
     */
    @Nullable
    List<T> getCandidates(@Nullable List<T> holder);

    /**
     * Reset this heavy hitter counter
     */
    void reset();

    /**
     * @return The ratio of the input to be used as the validation data, Float.NaN means no
     * validation is needed.
     */
    float getRequiredValidationInputRatio();

    /**
     * The default implementation of the {@link HeavyHitterSketch}.
     *
     * <p>Currently it consists of two passes: the first pass will take the input into
     * the MG(Misra–Gries) summary; while the secondary pass will validate the output against the
     * input in order to eliminate false postivies.</p>
     *
     * <p>For sure there are better approaches which takes only one pass, but also comes along with
     * overheads in terms of cpu/memory cost; the MG summary would be a trivial and good enough
     * pick.</p>
     *
     * @param <T> The type of the input.
     * @see <a href="https://en.wikipedia.org/wiki/Misra%E2%80%93Gries_summary">Misra–Gries
     * summary</a> for the detailed explanation of the algorithm.
     */
    final class HeavyHitterSketchImpl<T> implements HeavyHitterSketch<T> {
        /**
         * The array to track the current heavy hitters, its size &lt; {@link #mCapacity}.
         */
        private final SparseArray<T> mObjects = new SparseArray<>();

        /**
         * The frequencies of the current heavy hitters, its size &lt; {@link #mCapacity}.
         */
        private final SparseIntArray mFrequencies = new SparseIntArray();

        /**
         * The amount of the input of each pass
         */
        private int mPassSize;

        /**
         * The amount of the total input it expects
         */
        private int mTotalSize;

        /**
         * The maximum number of distinct input it should track
         */
        private int mCapacity;

        /**
         * The amount of inputs it already received.
         */
        private int mNumInputs;

        /**
         * Whether or not it's configured properly.
         */
        private boolean mConfigured;

        /**
         * Set the configuration with given parameters
         *
         * @param inputSize The amount of the input.
         * @param capacity  The maximum number of distinct input it should track; it defines the
         *                  lower bound of the output.
         */
        public void setConfig(final int inputSize, final int capacity) {
            if (inputSize < capacity || inputSize <= 1) {
                mConfigured = false;
                throw new IllegalArgumentException();
            }
            reset();
            mTotalSize = inputSize;
            mPassSize = inputSize >> 1;
            mCapacity = capacity;
            mConfigured = true;
        }

        /**
         * Add a new input to the current sketch.
         *
         * @param newInstance The new input
         */
        @Override
        public void add(@Nullable final T newInstance) {
            if (!mConfigured) {
                throw new IllegalStateException();
            }
            if (mNumInputs < mPassSize) {
                addToMGSummary(newInstance);
            } else if (mNumInputs < mTotalSize) {
                // Validation pass
                validate(newInstance);
            }
        }

        /**
         * Add an input to the MG summary.
         *
         * <p>Note the frequency in the result set is an estimation. Every (obj, freq') pair
         * in the result set, will have the following property:
         * <code>(freq - inputSize / capacity) &le; freq' &le; freq</code>
         * The above freq' is the estimated frequency, while the freq is the actual frequency.
         * </p>
         */
        private void addToMGSummary(@Nullable final T newInstance) {
            final int hashCode = newInstance != null ? newInstance.hashCode() : 0;
            final int index = mObjects.indexOfKey(hashCode);
            // MG summary
            if (index >= 0) {
                mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1);
            } else if (mObjects.size() < mCapacity - 1) {
                mObjects.put(hashCode, newInstance);
                mFrequencies.put(hashCode, 1);
            } else {
                for (int i = mFrequencies.size() - 1; i >= 0; i--) {
                    final int val = mFrequencies.valueAt(i) - 1;
                    if (val == 0) {
                        mObjects.removeAt(i);
                        mFrequencies.removeAt(i);
                    } else {
                        mFrequencies.setValueAt(i, val);
                    }
                }
            }
            if (++mNumInputs == mPassSize) {
                // Clear all the frequencies as we are going to validate them next
                for (int i = mFrequencies.size() - 1; i >= 0; i--) {
                    mFrequencies.setValueAt(i, 0);
                }
            }
        }

        /**
         * Validate the results from MG summary; the ones with frequencies less than lower boundary
         * will be removed from the set.
         */
        private void validate(@Nullable final T newInstance) {
            final int hashCode = newInstance != null ? newInstance.hashCode() : 0;
            final int index = mObjects.indexOfKey(hashCode);
            if (index >= 0) {
                mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1);
            }
            if (++mNumInputs == mTotalSize) {
                final int lower = mPassSize / mCapacity;
                // Remove any occurrences with frequencies less than lower boundary
                for (int i = mFrequencies.size() - 1; i >= 0; i--) {
                    final int val = mFrequencies.valueAt(i);
                    if (val < lower) {
                        mFrequencies.removeAt(i);
                        mObjects.removeAt(i);
                    }
                }
            }
        }

        /**
         * @param k      The number of heavy hitters it should return, k &lt; capacity, a value of 0
         *               will be equivalent to capacity - 1
         * @param holder The list array into which the elements of the tops are to be stored; a new
         *               list would be created and returned if this parameter is null.
         * @param freqs  Optional, the frequencies of each items in the returned result
         * @return The top K heavy hitters(frequent input)
         */
        @Override
        @Nullable
        public List<T> getTopHeavyHitters(final int k, final @Nullable List<T> holder,
                final @Nullable List<Float> freqs) {
            if (!mConfigured) {
                throw new IllegalStateException();
            }

            if (k >= mCapacity) {
                throw new IllegalArgumentException();
            }

            if (mNumInputs < mTotalSize) {
                // It hasn't had all the inputs yet.
                throw new IllegalStateException();
            }

            ArrayList<Integer> indexes = null;
            for (int i = mFrequencies.size() - 1; i >= 0; i--) {
                final int val = mFrequencies.valueAt(i);
                if (val > 0) {
                    if (indexes == null) {
                        indexes = new ArrayList<>();
                    }
                    indexes.add(i);
                }
            }
            if (indexes == null) {
                return null;
            }

            Collections.sort(indexes, (a, b) -> mFrequencies.valueAt(b) - mFrequencies.valueAt(a));

            final List<T> result = holder != null ? holder : new ArrayList<T>();
            final int max = Math.min(k == 0 ? (mCapacity - 1) : k, indexes.size());
            for (int i = 0; i < max; i++) {
                final int index = indexes.get(i);
                final T obj = mObjects.valueAt(index);
                if (obj != null) {
                    result.add(obj);
                    if (freqs != null) {
                        freqs.add((float) mFrequencies.valueAt(index) / mPassSize);
                    }
                }
            }
            return result;
        }

        /**
         * @param holder The list array into which the elements of the candidates are to be stored;
         *               a new list would be created and returned if this parameter is null.
         * @return The candidate heavy hitters so far, it could include false postives.
         */
        @Nullable
        public List<T> getCandidates(final @Nullable List<T> holder) {
            if (!mConfigured) {
                throw new IllegalStateException();
            }
            if (mNumInputs < mPassSize) {
                // It hasn't done with the first pass yet, return nothing
                return null;
            }

            List<T> result = holder != null ? holder : new ArrayList<T>();
            for (int i = mObjects.size() - 1; i >= 0; i--) {
                final T obj = mObjects.valueAt(i);
                if (obj != null) {
                    result.add(obj);
                }
            }
            return result;
        }

        /**
         * Reset this heavy hitter counter
         */
        @Override
        public void reset() {
            mNumInputs = 0;
            mObjects.clear();
            mFrequencies.clear();
        }

        /**
         * @return The ratio of the input to be used as the validation data, Float.NaN means no
         * validation is needed.
         */
        public float getRequiredValidationInputRatio() {
            return 0.5f;
        }
    }
}