
Commit 4c1a3ec

Merge branch 'master' into develop
2 parents: 78cbbf6 + 01a700c

4 files changed: 132 additions, 50 deletions

scripts/06_precompute_cache.py

Lines changed: 94 additions & 26 deletions
@@ -33,6 +33,8 @@ def main(
     import cupy as xp
     import cupy.cuda.device
 
+    cupy.take_along_axis = take_along_axis
+    cupy.put_along_axis = put_along_axis
     device = cupy.cuda.device.Device(gpu_id)
     device.use()
     vectors_dir = Path(vectors)
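
The two added assignments monkeypatch the backports (defined at the bottom of this script) onto the cupy module, so the batch loop below can call xp.take_along_axis and xp.put_along_axis uniformly. A more defensive variant of the patch (my sketch, not part of the commit) would only patch when the attribute is missing, so cupy's native implementations win once cupy 7 ships them:

    import cupy

    # Hypothetical guard: prefer cupy's own implementations when present,
    # fall back to this script's backports otherwise.
    if not hasattr(cupy, "take_along_axis"):
        cupy.take_along_axis = take_along_axis
    if not hasattr(cupy, "put_along_axis"):
        cupy.put_along_axis = put_along_axis
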
@@ -56,35 +58,26 @@ def main(
     msg.good(f"Normalized (mean {mean:,.2f}, variance {var:,.2f})")
     msg.info(f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent")
     n = min(n_neighbors, vectors.shape[0])
+    subset = vectors[:cutoff]
     best_rows = xp.zeros((end - start, n), dtype="i")
     scores = xp.zeros((end - start, n), dtype="f")
-    # Pre-allocate this array, so we can use it each time.
-    subset = xp.ascontiguousarray(vectors[:cutoff])
-    sims = xp.zeros((batch_size, cutoff), dtype="f")
-    indices = xp.arange(cutoff).reshape((-1, 1))
     for i in tqdm.tqdm(list(range(start, end, batch_size))):
-        batch = vectors[i : i + batch_size]
-        # batch e.g. (1024, 300)
-        # vectors e.g. (10000, 300)
-        # sims e.g. (1024, 10000)
-        if batch.shape[0] == sims.shape[0]:
-            xp.dot(batch, subset.T, out=sims)
-        else:
-            # In the last batch we'll have a different size.
-            sims = xp.dot(batch, subset.T)
-        size = sims.shape[0]
-        # Get the indices and scores for the top N most similar for each in the
-        # batch. This is a bit complicated, to avoid sorting all of the scores
-        # -- we only want the top N to be sorted (which we do later). For now,
-        # we use argpartition to just get the cut point.
-        neighbors = xp.argpartition(sims, -n, axis=1)[:, -n:]
-        neighbor_sims = xp.partition(sims, -n, axis=1)[:, -n:]
-        # Can't figure out how to do this without the loop.
-        for j in range(min(end - i, size)):
-            # Sort in reverse order
-            indices = xp.argsort(neighbor_sims[j], axis=-1)[::-1]
-            best_rows[i + j] = xp.take(neighbors[j], indices)
-            scores[i + j] = xp.take(neighbor_sims[j], indices)
+        size = min(batch_size, end - i)
+        batch = vectors[i : i + size]
+        sims = xp.dot(batch, subset.T)
+        # Set self-similarities to -inf, so that we don't return them.
+        indices = xp.arange(i, min(i + size, sims.shape[1])).reshape((-1, 1))
+        xp.put_along_axis(sims, indices, -xp.inf, axis=1)
+        # This used to use argpartition, to do a partial sort... But this ended
+        # up being a rat's nest of terrible numpy crap. Just sorting the whole
+        # list isn't really slower, and it's much simpler to read.
+        ranks = xp.argsort(sims, axis=1)
+        batch_rows = ranks[:, -n:]
+        # Reverse, so scores run from highest to lowest.
+        batch_rows = batch_rows[:, ::-1]
+        batch_scores = xp.take_along_axis(sims, batch_rows, axis=1)
+        best_rows[i : i + size] = batch_rows
+        scores[i : i + size] = batch_scores
     msg.info("Saving output")
     if not isinstance(best_rows, numpy.ndarray):
         best_rows = best_rows.get()
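
The rewritten loop masks each row's self-similarity with -inf, does one full argsort per batch, slices the last n columns, and flips them so scores run from best to worst. A minimal numpy sketch of the same pattern (toy shapes and names, mine rather than the script's):

    import numpy as np

    rng = np.random.default_rng(0)
    sims = rng.random((4, 10)).astype("f")  # (batch, cutoff) similarity matrix
    n = 3

    # Mask self-similarity so a row never returns itself (rows 0..3 here).
    np.put_along_axis(sims, np.arange(4).reshape(-1, 1), -np.inf, axis=1)

    # Full sort per row, keep the n highest, reverse into descending order.
    ranks = np.argsort(sims, axis=1)
    top_rows = ranks[:, -n:][:, ::-1]  # (batch, n) neighbor indices
    top_scores = np.take_along_axis(sims, top_rows, axis=1)

    assert top_scores.shape == (4, n)
    assert (np.diff(top_scores, axis=1) <= 0).all()  # descending per row
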
@@ -103,6 +96,81 @@ def main(
     msg.good(f"Saved cache to {output_file}")
 
 
+# These functions are missing from cupy, but will be supported in cupy 7.
+def take_along_axis(a, indices, axis):
+    """Take values from the input array by matching 1d index and data slices.
+
+    Args:
+        a (cupy.ndarray): Array to extract elements.
+        indices (cupy.ndarray): Indices to take along each 1d slice of ``a``.
+        axis (int): The axis to take 1d slices along.
+
+    Returns:
+        cupy.ndarray: The indexed result.
+
+    .. seealso:: :func:`numpy.take_along_axis`
+    """
+    import cupy
+
+    if indices.dtype.kind not in ("i", "u"):
+        raise IndexError("`indices` must be an integer array")
+
+    if axis is None:
+        a = a.ravel()
+        axis = 0
+
+    ndim = a.ndim
+
+    if not (-ndim <= axis < ndim):
+        raise IndexError("Axis overrun")
+
+    axis %= a.ndim
+
+    if ndim != indices.ndim:
+        raise ValueError("`indices` and `a` must have the same number of dimensions")
+
+    fancy_index = []
+    for i, n in enumerate(a.shape):
+        if i == axis:
+            fancy_index.append(indices)
+        else:
+            ind_shape = (1,) * i + (-1,) + (1,) * (ndim - i - 1)
+            fancy_index.append(cupy.arange(n).reshape(ind_shape))
+
+    return a[fancy_index]
+
+
+def put_along_axis(a, indices, value, axis):
+    import cupy
+
+    if indices.dtype.kind not in ("i", "u"):
+        raise IndexError("`indices` must be an integer array")
+
+    if axis is None:
+        a = a.ravel()
+        axis = 0
+
+    ndim = a.ndim
+
+    if not (-ndim <= axis < ndim):
+        raise IndexError("Axis overrun")
+
+    axis %= a.ndim
+
+    if ndim != indices.ndim:
+        raise ValueError("`indices` and `a` must have the same number of dimensions")
+
+    fancy_index = []
+    for i, n in enumerate(a.shape):
+        if i == axis:
+            fancy_index.append(indices)
+        else:
+            ind_shape = (1,) * i + (-1,) + (1,) * (ndim - i - 1)
+            fancy_index.append(cupy.arange(n).reshape(ind_shape))
+
+    a[fancy_index] = value
+
+
 if __name__ == "__main__":
     try:
         plac.call(main)
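
Both backports build a fancy index: the caller's indices on the target axis, plus broadcast-ready arange arrays on every other axis. A quick numpy check of that construction (a sketch for illustration, substituting numpy for cupy so it runs without a GPU; fancy_take is my name, not part of the script):

    import numpy as np

    def fancy_take(a, indices, axis):
        # Same construction as the backport: arange on every axis except
        # `axis`, reshaped so the index arrays broadcast against `indices`.
        fancy = []
        for i, n in enumerate(a.shape):
            if i == axis:
                fancy.append(indices)
            else:
                shape = (1,) * i + (-1,) + (1,) * (a.ndim - i - 1)
                fancy.append(np.arange(n).reshape(shape))
        return a[tuple(fancy)]

    a = np.arange(12).reshape(3, 4)
    idx = np.argsort(a, axis=1)[:, -2:]  # top-2 columns per row
    assert (fancy_take(a, idx, axis=1) == np.take_along_axis(a, idx, axis=1)).all()
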

sense2vec/sense2vec.py

Lines changed: 34 additions & 22 deletions
@@ -30,6 +30,7 @@ def __init__(
         RETURNS (Sense2Vec): The newly constructed object.
         """
         self.vectors = Vectors(shape=shape, name=vectors_name)
+        self._row2key = None
         self.strings = StringStore() if strings is None else strings
         self.freqs: Dict[int, int] = {}
         self.cache = None
@@ -87,6 +88,7 @@ def __setitem__(self, key: Union[str, int], vector: numpy.ndarray):
         if key not in self.vectors:
             raise ValueError(f"Can't find key {key} in table")
         self.vectors[key] = vector
+        self._row2key = None
 
     def __iter__(self):
         """YIELDS (tuple): String key and vector pairs in the table."""
@@ -106,6 +108,12 @@ def values(self):
         """YIELDS (numpy.ndarray): The vectors in the table."""
         yield from self.vectors.values()
 
+    @property
+    def row2key(self):
+        if not self._row2key:
+            self._row2key = {row: key for key, row in self.vectors.key2row.items()}
+        return self._row2key
+
     @property
     def make_key(self) -> Callable:
         """Get the function to make keys."""
@@ -128,6 +136,7 @@ def add(self, key: Union[str, int], vector: numpy.ndarray, freq: int = None):
         self.vectors.add(key, vector=vector)
         if freq is not None:
             self.set_freq(key, freq)
+        self._row2key = None
 
     def get_freq(self, key: Union[str, int], default=None) -> Union[int, None]:
         """Get the frequency count for a given key.
@@ -200,31 +209,32 @@ def most_similar(
         """
         if isinstance(keys, (str, int)):
             keys = [keys]
-        # Always ask for more because we'll always get the keys themselves
-        n_similar = n + len(keys)
         for key in keys:
            if key not in self:
                raise ValueError(f"Can't find key {key} in table")
-        if len(self.vectors) < n_similar:
-            n_similar = len(self.vectors)
-        if self.cache:
-            indices = self.cache.get("indices", [])
-            scores = self.cache.get("scores", [])
-            if len(indices) >= n_similar:
-                key_row = self.vectors.find(key=key)
-                sim_keys = self.vectors.find(rows=indices[key_row][:n_similar])
-                sim_scores = scores[key_row][:n_similar]
-                result = [(self.strings[k], s) for k, s in zip(sim_keys, sim_scores)]
-                return [(key, score) for key, score in result if key not in keys]
-        vecs = numpy.vstack([self[key] for key in keys])
-        average = vecs.mean(axis=0, keepdims=True)
-        result_keys, _, scores = self.vectors.most_similar(
-            average, n=n_similar, batch_size=batch_size
-        )
-        result = list(zip(result_keys.flatten(), scores.flatten()))
-        result = [(self.strings[key], score) for key, score in result if key]
-        result = [(key, score) for key, score in result if key not in keys]
-        return result
+        if self.cache and self.cache["indices"].shape[1] >= n:
+            n = min(len(self.vectors), n)
+            key = self.ensure_int_key(key)
+            key_row = self.vectors.find(key=key)
+            rows = self.cache["indices"][key_row, :n]
+            scores = self.cache["scores"][key_row, :n]
+            keys = [self.row2key[r] for r in rows]
+            keys = [self.strings[k] for k in keys]
+            assert len(keys) == len(scores)
+            return list(zip(keys, scores))
+        else:
+            # Always ask for more because we'll always get the keys themselves
+            n = min(len(self.vectors), n + len(keys))
+            rows = numpy.asarray(self.vectors.find(keys=keys))
+            vecs = self.vectors.data[rows]
+            average = vecs.mean(axis=0, keepdims=True)
+            result_keys, _, scores = self.vectors.most_similar(
+                average, n=n, batch_size=batch_size
+            )
+            result = list(zip(result_keys.flatten(), scores.flatten()))
+            result = [(self.strings[key], score) for key, score in result if key]
+            result = [(key, score) for key, score in result if key not in keys]
+            return result
 
     def get_other_senses(
         self, key: Union[str, int], ignore_case: bool = True
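
With this change, most_similar is served straight from the precomputed cache whenever one is attached and holds at least n neighbors per row; otherwise it falls back to averaging the queried vectors and running a full similarity search. A usage sketch, assuming a model on disk (the path and keys here are hypothetical):

    from sense2vec import Sense2Vec

    s2v = Sense2Vec().from_disk("/path/to/s2v_model")  # hypothetical path
    # Served from the cache when s2v.cache holds enough neighbors,
    # otherwise computed via the averaged-vector search.
    print(s2v.most_similar("natural_language_processing|NOUN", n=3))
    print(s2v.most_similar(["duck|NOUN", "goose|NOUN"], n=3))
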
@@ -302,6 +312,7 @@ def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
         self.strings = StringStore().from_bytes(data["strings"])
         if "cache" not in exclude and "cache" in data:
             self.cache = data.get("cache", {})
+        self._row2key = None
         return self
 
     def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
@@ -338,4 +349,5 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
         self.strings = StringStore().from_disk(strings_path)
         if "cache" not in exclude and cache_path.exists():
             self.cache = srsly.read_msgpack(cache_path)
+        self._row2key = None
         return self

tests/data/cache

Binary file (0 bytes) not shown.

tests/test_model.py

Lines changed: 4 additions & 2 deletions
@@ -25,8 +25,10 @@ def test_model_most_similar_cache(s2v):
     # Modify cache to test that the cache is used and values aren't computed
     query_row = s2v.vectors.find(key=s2v.ensure_int_key(query))
     scores = numpy.array(s2v.cache["scores"], copy=True)  # otherwise not writable
-    scores[query_row, 1] = 2.0
-    scores[query_row, 2] = 3.0
+    honey_bees_row = s2v.vectors.find(key="honey_bees|NOUN")
+    scores[query_row, honey_bees_row] = 2.0
+    beekeepers_row = s2v.vectors.find(key="Beekepers|NOUN")
+    scores[query_row, beekeepers_row] = 3.0
     s2v.cache["scores"] = scores
     ((key1, score1), (key2, score2)) = s2v.most_similar([query], n=2)
     assert key1 == "honey_bees|NOUN"
