twitter
diff --git a/‎algebird-benchmark/src/main/scala/com/twitter/algebird/benchmark/ReservoirSamplingBenchmark.scala
Lines changed: 46 additions & 0 deletions b/‎algebird-benchmark/src/main/scala/com/twitter/algebird/benchmark/ReservoirSamplingBenchmark.scala
Lines changed: 46 additions & 0 deletions
diff --git a/‎algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala
Lines changed: 5 additions & 6 deletions b/‎algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala
Lines changed: 5 additions & 6 deletions
diff --git a/‎algebird-core/src/main/scala/com/twitter/algebird/mutable/ReservoirSampling.scala
Lines changed: 259 additions & 0 deletions b/‎algebird-core/src/main/scala/com/twitter/algebird/mutable/ReservoirSampling.scala
Lines changed: 259 additions & 0 deletions
@@ -0,0 +1,46 @@
+package com.twitter.algebird.benchmark
+
+import com.twitter.algebird.mutable.ReservoirSamplingToListAggregator
+import com.twitter.algebird.{Aggregator, Preparer}
+import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State}
+import org.openjdk.jmh.infra.Blackhole
+
+import scala.util.Random
+
+object ReservoirSamplingBenchmark {
+  @State(Scope.Benchmark)
+  class BenchmarkState {
+    @Param(Array("100", "10000", "1000000"))
+    var collectionSize: Int = 0
+
+    @Param(Array("0.001", "0.01", "0.1"))
+    var sampleRate: Double = 0.0
+
+    def samples: Int = (sampleRate * collectionSize).ceil.toInt
+  }
+
+  val rng = new Random()
+  implicit val randomSupplier: () => Random = () => rng
+}
+
+class ReservoirSamplingBenchmark {
+  import ReservoirSamplingBenchmark._
+
+  private def prioQueueSampler[T](count: Int) =
+    Preparer[T]
+      .map(rng.nextDouble() -> _)
+      .monoidAggregate(Aggregator.sortByTake(count)(_._1))
+      .andThenPresent(_.map(_._2))
+
+  @Benchmark
+  def timeAlgorithmL(state: BenchmarkState, bh: Blackhole): Unit =
+    bh.consume(new ReservoirSamplingToListAggregator[Int](state.samples).apply(0 until state.collectionSize))
+
+  /**
+   * Benchmarks the sampler on the generic `TraversableOnce` code path, for comparison with the `IndexedSeq`
+   * fast path measured by `timeAlgorithmL`.
+   */
+  @Benchmark
+  def timeAlgorithmLSeq(state: BenchmarkState, bh: Blackhole): Unit = {
+    // Widen the static type with an ascription (not a cast) so that overload resolution selects
+    // apply(TraversableOnce) instead of apply(IndexedSeq).
+    val xs: Seq[Int] = 0 until state.collectionSize
+    bh.consume(new ReservoirSamplingToListAggregator[Int](state.samples).apply(xs))
+  }
+
+  @Benchmark
+  def timePriorityQeueue(state: BenchmarkState, bh: Blackhole): Unit =
+    bh.consume(prioQueueSampler(state.samples).apply(0 until state.collectionSize))
+}
@@ -1,5 +1,7 @@
 package com.twitter.algebird
 
+import com.twitter.algebird.mutable.{Reservoir, ReservoirSamplingToListAggregator}
+
 import java.util.PriorityQueue
 import scala.collection.compat._
 import scala.collection.generic.CanBuildFrom
@@ -286,12 +288,9 @@ object Aggregator extends java.io.Serializable {
   def reservoirSample[T](
       count: Int,
       seed: Int = DefaultSeed
-  ): MonoidAggregator[T, PriorityQueue[(Double, T)], Seq[T]] = {
-    val rng = new java.util.Random(seed)
-    Preparer[T]
-      .map(rng.nextDouble() -> _)
-      .monoidAggregate(sortByTake(count)(_._1))
-      .andThenPresent(_.map(_._2))
+  ): MonoidAggregator[T, Reservoir[T], Seq[T]] = {
+    val rng = new scala.util.Random(seed)
+    new ReservoirSamplingToListAggregator[T](count)(() => rng)
   }
 
   /**
 
@@ -0,0 +1,259 @@
+package com.twitter.algebird.mutable
+
+import com.twitter.algebird.{Monoid, MonoidAggregator}
+
+import scala.collection.mutable
+import scala.util.Random
+
+/**
+ * A reservoir of the currently sampled items.
+ *
+ * @param capacity
+ *   the reservoir capacity
+ * @tparam T
+ *   the element type
+ */
+sealed class Reservoir[T](val capacity: Int) {
+  var reservoir: mutable.ArrayBuffer[T] = new mutable.ArrayBuffer
+
+  // When the reservoir is full, w is the threshold for accepting an element into the reservoir, and
+  // the following invariant holds: The maximum score of the elements in the reservoir is w,
+  // and the remaining elements are distributed as U[0, w].
+  // Scores are not kept explicitly, only their distribution is tracked and sampled from.
+  // (w = 1 when the reservoir is not full.)
+  var w: Double = 1
+
+  require(capacity > 0, "reservoir size must be positive")
+  private val kInv: Double = 1d / capacity
+
+  def size: Int = reservoir.size
+  def isEmpty: Boolean = reservoir.isEmpty
+  def isFull: Boolean = size == capacity
+
+  /**
+   * Add an element to the reservoir. If the reservoir is full then the element will replace a random element
+   * in the reservoir, and the threshold <pre>w</pre> is updated.
+   *
+   * When adding multiple elements, [[append]] should be used to take advantage of exponential jumps.
+   *
+   * @param x
+   *   the element to add
+   * @param rng
+   *   the random source
+   */
+  def accept(x: T, rng: Random): Unit = {
+    if (isFull) {
+      reservoir(rng.nextInt(capacity)) = x
+    } else {
+      reservoir.append(x)
+    }
+    if (isFull) {
+      w *= Math.pow(rng.nextDouble, kInv)
+    }
+  }
+
+  // The number of items to skip before accepting the next item is geometrically distributed
+  // with probability of success w / prior. The prior will be 1 when adding to a single reservoir,
+  // but when merging reservoirs it will be the threshold of the reservoir being pulled from,
+  // and in this case we require that w < prior.
+  private def nextAcceptTime(rng: Random, prior: Double = 1.0): Int =
+    (-rng.self.nextExponential / Math.log1p(-w / prior)).toInt
+
+  /**
+   * Add multiple elements to the reservoir.
+   *
+   * While the reservoir is not full every element is accepted. Once it is full, elements are skipped between
+   * acceptances, with the number to skip drawn from <pre>nextAcceptTime</pre>. This overload takes no prior,
+   * so <pre>nextAcceptTime</pre> uses its default prior of 1.0.
+   *
+   * @param xs
+   *   the elements to add
+   * @param rng
+   *   the random source
+   * @return
+   *   this reservoir
+   */
+  def append(xs: TraversableOnce[T], rng: Random): Reservoir[T] = {
+    var skip = if (isFull) nextAcceptTime(rng) else 0
+    for (x <- xs) {
+      if (!isFull) {
+        // keep adding while reservoir is not full
+        accept(x, rng)
+        if (isFull) {
+          skip = nextAcceptTime(rng)
+        }
+      } else if (skip > 0) {
+        skip -= 1
+      } else {
+        accept(x, rng)
+        skip = nextAcceptTime(rng)
+      }
+    }
+    this
+  }
+
+  /**
+   * Add multiple elements to the reservoir. This overload is optimized for indexed sequences, where we can
+   * skip over multiple indexes without accessing the elements.
+   *
+   * If <pre>xs</pre> has fewer elements than there are free slots, all of them are added and the reservoir
+   * is left partially filled.
+   *
+   * @param xs
+   *   the elements to add
+   * @param rng
+   *   the random source
+   * @param prior
+   *   the threshold of the elements being added, such that the added element's value is distributed as
+   *   <pre>U[0, prior]</pre>
+   * @return
+   *   this reservoir
+   */
+  def append(xs: IndexedSeq[T], rng: Random, prior: Double): Reservoir[T] = {
+    // fill the free slots first; accept() adds unconditionally while the reservoir is not full
+    var i = xs.size.min(capacity - size)
+    for (j <- 0 until i) {
+      accept(xs(j), rng)
+    }
+
+    // Either the reservoir is now full, or xs ran out while filling it. In the latter case i == xs.size,
+    // so the loop below is skipped and no assertion on fullness is needed.
+    val end = xs.size
+    while (i >= 0 && i < end) {
+      i += nextAcceptTime(rng, prior)
+      // the addition can overflow, in which case i < 0
+      if (i >= 0 && i < end) {
+        // element enters the reservoir
+        reservoir(rng.nextInt(capacity)) = xs(i)
+        w *= Math.pow(rng.nextDouble, kInv)
+        i += 1
+      }
+    }
+    this
+  }
+
+  override def toString: String = s"Reservoir($capacity, $w, ${reservoir.toList})"
+}
+
+object Reservoir {
+  implicit def monoid[T](implicit randomSupplier: () => Random): Monoid[Reservoir[T]] =
+    new ReservoirMonoid()(randomSupplier)
+}
+
+/**
+ * This is the "Algorithm L" reservoir sampling algorithm [1], with modifications to act as a monoid by
+ * merging reservoirs.
+ *
+ * [1] Kim-Hung Li, "Reservoir-Sampling Algorithms of Time Complexity O(n(1+log(N/n)))", 1994
+ *
+ * @tparam T
+ *   the item type
+ */
+class ReservoirMonoid[T](implicit val randomSupplier: () => Random) extends Monoid[Reservoir[T]] {
+
+  /**
+   * Builds a reservoir with a single item.
+   *
+   * @param k
+   *   the reservoir capacity
+   * @param x
+   *   the item to add
+   * @return
+   */
+  def build(k: Int, x: T): Reservoir[T] = {
+    val r = new Reservoir[T](k)
+    r.accept(x, randomSupplier())
+    r
+  }
+
+  override def zero: Reservoir[T] = new Reservoir(1)
+  def zero(k: Int): Reservoir[T] = new Reservoir(k)
+  override def isNonZero(r: Reservoir[T]): Boolean = !r.isEmpty
+
+  /**
+   * Merge two reservoirs. NOTE: This mutates one or both of the reservoirs. They should not be used after
+   * this operation, except as the return value for further aggregation.
+   */
+  override def plus(left: Reservoir[T], right: Reservoir[T]): Reservoir[T] =
+    if (left.isEmpty) right
+    else if (left.size + right.size <= left.capacity) {
+      // the sum of the sizes does not exceed the reservoir capacity, so we can just merge
+      left.append(right.reservoir, randomSupplier())
+    } else {
+      val (s1, s2) = if (left.w < right.w) (left, right) else (right, left)
+      val rng = randomSupplier()
+      if (s2.isFull) {
+        // The highest score in s2 is w, and the other scores are distributed as U[0, w].
+        // Since s1.w < s2.w, we have to drop the single (sampled) element with the highest score
+        // unconditionally. The other elements enter the reservoir with probability s1.w / s2.w.
+        val i = rng.nextInt(s2.size)
+        s2.reservoir(i) = s2.reservoir.head
+        s1.append(s2.reservoir.drop(1), rng, s2.w)
+      } else {
+        s1.append(s2.reservoir, rng, 1.0)
+      }
+    }
+}
+
+/**
+ * An aggregator that uses reservoir sampling to sample k elements from a stream of items. Because the
+ * reservoir is mutable, it is a good idea to copy the result to an immutable view before using it, as is done
+ * by [[ReservoirSamplingToListAggregator]].
+ *
+ * The aggregator defines operations for [[IndexedSeq]]s that allow for more efficient aggregation, however
+ * care must be taken with methods such as [[composePrepare()]] which return a regular [[MonoidAggregator]]
+ * that loses this optimized behavior.
+ *
+ * @param k
+ *   the number of elements to sample
+ * @param randomSupplier
+ *   the random generator
+ * @tparam T
+ *   the item type
+ * @tparam C
+ *   the result type
+ */
+abstract class ReservoirSamplingAggregator[T, +C](k: Int)(implicit val randomSupplier: () => Random)
+    extends MonoidAggregator[T, Reservoir[T], C] {
+  override val monoid: ReservoirMonoid[T] = new ReservoirMonoid
+  override def prepare(x: T): Reservoir[T] = monoid.build(k, x)
+
+  override def apply(xs: TraversableOnce[T]): C = present(agg(xs))
+  def apply(xs: IndexedSeq[T]): C = present(agg(xs))
+
+  override def applyOption(inputs: TraversableOnce[T]): Option[C] =
+    if (inputs.isEmpty) None else Some(apply(inputs))
+
+  override def append(r: Reservoir[T], t: T): Reservoir[T] = r.append(Seq(t), randomSupplier())
+
+  override def appendAll(r: Reservoir[T], xs: TraversableOnce[T]): Reservoir[T] =
+    r.append(xs, randomSupplier())
+  def appendAll(r: Reservoir[T], xs: IndexedSeq[T]): Reservoir[T] =
+    r.append(xs, randomSupplier(), 1.0)
+
+  override def appendAll(xs: TraversableOnce[T]): Reservoir[T] = agg(xs)
+  def appendAll(xs: IndexedSeq[T]): Reservoir[T] = agg(xs)
+
+  private def agg(xs: TraversableOnce[T]): Reservoir[T] =
+    appendAll(monoid.zero(k), xs)
+  private def agg(xs: IndexedSeq[T]): Reservoir[T] =
+    appendAll(monoid.zero(k), xs)
+}
+
+class ReservoirSamplingToListAggregator[T](k: Int)(implicit randomSupplier: () => Random)
+    extends ReservoirSamplingAggregator[T, List[T]](k)(randomSupplier) {
+  override def present(r: Reservoir[T]): List[T] =
+    randomSupplier().shuffle(r.reservoir).toList
+
+  override def andThenPresent[D](f: List[T] => D): MonoidAggregator[T, Reservoir[T], D] =
+    new AndThenPresent(this, f)
+}
+
+/**
+ * Aggregator that implements [[andThenPresent]] without ruining the optimized behavior of the wrapped aggregator.
+ */
+protected class AndThenPresent[-A, B, C, +D](val agg: MonoidAggregator[A, B, C], f: C => D)
+    extends MonoidAggregator[A, B, D] {
+  override val monoid: Monoid[B] = agg.monoid
+  override def prepare(a: A): B = agg.prepare(a)
+  override def present(b: B): D = f(agg.present(b))
+
+  override def apply(xs: TraversableOnce[A]): D = f(agg(xs))
+  override def applyOption(xs: TraversableOnce[A]): Option[D] = agg.applyOption(xs).map(f)
+  override def append(b: B, a: A): B = agg.append(b, a)
+  override def appendAll(b: B, as: TraversableOnce[A]): B = agg.appendAll(b, as)
+  override def appendAll(as: TraversableOnce[A]): B = agg.appendAll(as)
+}