|
| 1 | +# paddle.incubate.sparse.transpose 设计文档 |
| 2 | + |
| 3 | +| API名称 | paddle.incubate.sparse.transpose | |
| 4 | +|----------------------------------------------------------|-----------------------------------------------| |
| 5 | +| 提交作者<input type="checkbox" class="rowselector hidden"> | 六个骨头 | |
| 6 | +| 提交时间<input type="checkbox" class="rowselector hidden"> | 2022-09-07 | |
| 7 | +| 版本号 | V1.0 | |
| 8 | +| 依赖飞桨版本<input type="checkbox" class="rowselector hidden"> | develop | |
| 9 | +| 文件名 | 20220907_api_design_for_sparse_transpose.md<br> | |
| 10 | + |
| 11 | +# 一、概述 |
| 12 | + |
| 13 | +## 1、相关背景 |
| 14 | + |
| 15 | +为了提升飞桨API丰富度,针对 Paddle 的两种稀疏 Tensor 格式 COO 与 CSR ,都需新增 transpose 的计算逻辑, |
| 16 | +一共需要新增 2个 kernel 的前向与反向,其中 CSR 的 kernel 需支持 2D/3D Tensor,COO 的 kernel 需支持任意维度的 Tensor。 |
| 17 | + |
| 18 | +## 2、意义 |
| 19 | + |
| 20 | +支持稀疏tensor的transpose操作,丰富基础功能,提升稀疏tensor的API完整度。 |
| 21 | + |
| 22 | +# 二、飞桨现状 |
| 23 | + |
| 24 | +目前paddle缺少相关功能实现。 |
| 25 | + |
| 26 | +# 三、业内方案调研 |
| 27 | + |
| 28 | +## Pytorch |
| 29 | + |
| 30 | +Pytorch中相关实现如下 |
| 31 | + |
| 32 | +```cpp |
| 33 | +static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t dim1) { |
| 34 | + int64_t nsparse_dim = self.sparse_dim(); |
| 35 | + TORCH_CHECK(dim0 < nsparse_dim && dim1 < nsparse_dim, |
| 36 | + "sparse transpose: transposed dimensions must be sparse ", |
| 37 | + "Got sparse_dim: ", nsparse_dim, ", d0: ", dim0, ", d1: ", dim1); |
| 38 | + |
| 39 | + if (self._indices().numel() == 0 && self._values().numel() == 0) { |
| 40 | + auto sizes = self.sizes().vec(); |
| 41 | + std::swap(sizes[dim0], sizes[dim1]); |
| 42 | + |
| 43 | + at::sparse::get_sparse_impl(self)->raw_resize_(self.sparse_dim(), self.dense_dim(), sizes); |
| 44 | + } else { |
| 45 | + auto indices = self._indices(); |
| 46 | + auto row0 = indices.select(0, dim0); |
| 47 | + auto row1 = indices.select(0, dim1); |
| 48 | + |
| 49 | + // swap row0 and row1 |
| 50 | + auto tmp = at::zeros_like(row0, LEGACY_CONTIGUOUS_MEMORY_FORMAT); |
| 51 | + tmp.copy_(row0); |
| 52 | + row0.copy_(row1); |
| 53 | + row1.copy_(tmp); |
| 54 | + |
| 55 | + self._coalesced_(false); |
| 56 | + |
| 57 | + auto sizes = self.sizes().vec(); |
| 58 | + std::swap(sizes[dim0], sizes[dim1]); |
| 59 | + |
| 60 | + at::sparse::get_sparse_impl(self)->raw_resize_(self._indices().size(0), self._values().dim() - 1, sizes); |
| 61 | + } |
| 62 | + return self; |
| 63 | +} |
| 64 | +``` |
| 65 | +## scipy |
| 66 | +scipy中转换为csr再进行transpose |
| 67 | +```python |
| 68 | +def transpose(self, axes=None, copy=False): |
| 69 | + """ |
| 70 | + Reverses the dimensions of the sparse matrix. |
| 71 | + Parameters |
| 72 | + ---------- |
| 73 | + axes : None, optional |
| 74 | + This argument is in the signature *solely* for NumPy |
| 75 | + compatibility reasons. Do not pass in anything except |
| 76 | + for the default value. |
| 77 | + copy : bool, optional |
| 78 | + Indicates whether or not attributes of `self` should be |
| 79 | + copied whenever possible. The degree to which attributes |
| 80 | + are copied varies depending on the type of sparse matrix |
| 81 | + being used. |
| 82 | + Returns |
| 83 | + ------- |
| 84 | + p : `self` with the dimensions reversed. |
| 85 | + See Also |
| 86 | + -------- |
| 87 | + numpy.matrix.transpose : NumPy's implementation of 'transpose' |
| 88 | + for matrices |
| 89 | + """ |
| 90 | + return self.tocsr(copy=copy).transpose(axes=axes, copy=False) |
| 91 | +``` |
| 92 | +csr transpose实现如下 |
| 93 | +```python |
| 94 | + |
| 95 | +def transpose(self, axes=None, copy=False): |
| 96 | + if axes is not None: |
| 97 | + raise ValueError(("Sparse matrices do not support " |
| 98 | + "an 'axes' parameter because swapping " |
| 99 | + "dimensions is the only logical permutation.")) |
| 100 | + |
| 101 | + M, N = self.shape |
| 102 | + return self._csc_container((self.data, self.indices, |
| 103 | + self.indptr), shape=(N, M), copy=copy) |
| 104 | + |
| 105 | +``` |
| 106 | +## paddle DenseTensor |
| 107 | +参数dims在DenseTensor中被表达为perm,其长度与输入张量的维度必须相等, |
| 108 | +返回多维张量的第i维对应输入Tensor的perm[i]维。 |
| 109 | + |
| 110 | +代码如下 |
| 111 | +```python |
| 112 | +x = [[[ 1 2 3 4] [ 5 6 7 8] [ 9 10 11 12]] |
| 113 | + [[13 14 15 16] [17 18 19 20] [21 22 23 24]]] |
| 114 | +shape(x) = [2,3,4] |
| 115 | + |
| 116 | +# 例0 |
| 117 | +perm0 = [1,0,2] |
| 118 | +y_perm0 = [[[ 1 2 3 4] [13 14 15 16]] |
| 119 | + [[ 5 6 7 8] [17 18 19 20]] |
| 120 | + [[ 9 10 11 12] [21 22 23 24]]] |
| 121 | +shape(y_perm0) = [3,2,4] |
| 122 | + |
| 123 | +# 例1 |
| 124 | +perm1 = [2,1,0] |
| 125 | +y_perm1 = [[[ 1 13] [ 5 17] [ 9 21]] |
| 126 | + [[ 2 14] [ 6 18] [10 22]] |
| 127 | + [[ 3 15] [ 7 19] [11 23]] |
| 128 | + [[ 4 16] [ 8 20] [12 24]]] |
| 129 | +shape(y_perm1) = [4,3,2] |
| 130 | +``` |
| 131 | +但是上述实现面向 DenseTensor,其直接通过指针访问连续内存的方式无法用于稀疏 Tensor。 |
| 132 | +# 四、对比分析 |
| 133 | +为了适配paddle phi库的设计模式,需自行设计实现方式 |
| 134 | +# 五、方案设计 |
| 135 | +## 命名与参数设计 |
| 136 | +在 paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.cc 中,反向 kernel 设计为 |
| 137 | +``` |
| 138 | +template <typename T, typename Context> |
| 139 | +void TransposeCooGradKernel(const Context& dev_ctx, |
| 140 | + const SparseCooTensor& x, |
| 141 | + const SparseCooTensor& dout, |
| 142 | + SparseCooTensor* dx) |
| 143 | +template <typename T, typename Context> |
| 144 | +void TransposeCsrGradKernel(const Context& dev_ctx, |
| 145 | + const SparseCsrTensor& x, |
| 146 | + const SparseCsrTensor& dout, |
| 147 | + SparseCsrTensor* dx) |
| 148 | +``` |
| 149 | +在 paddle/phi/kernels/sparse/impl/unary_kernel_impl.cc 中,前向 kernel 设计为 |
| 150 | +``` |
| 151 | +template <typename T, typename Context> |
| 152 | +void TransposeCooKernel(const Context& dev_ctx, |
| 153 | + const SparseCooTensor& x, |
| 154 | + const std::vector<int>& dims, |
| 155 | + SparseCooTensor* out) |
| 156 | + template <typename T, typename Context> |
| 157 | +void TransposeCsrKernel(const Context& dev_ctx, |
| 158 | + const SparseCsrTensor& x, |
| 159 | + const std::vector<int>& dims, |
| 160 | + SparseCsrTensor* out) |
| 161 | +``` |
| 162 | +并在yaml中新增对应API |
| 163 | +```yaml |
| 164 | +- api : transpose |
| 165 | + args : (Tensor x, int[] dims) |
| 166 | + output : Tensor(out) |
| 167 | + kernel : |
| 168 | + func : transpose_coo{sparse_coo -> sparse_coo}, |
| 169 | + transpose_csr{sparse_csr -> sparse_csr} |
| 170 | + layout : x |
| 171 | + backward : transpose_grad |
| 172 | + |
| 173 | +``` |
| 174 | +```yaml |
| 175 | +- backward_api : transpose_grad |
| 176 | + forward : transpose(Tensor x, int[] dims) -> Tensor(out) |
| 177 | + args : (Tensor out, Tensor out_grad) |
| 178 | + output : Tensor(x_grad) |
| 179 | + kernel : |
| 180 | + func : transpose_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, |
| 181 | + transpose_csr_grad {sparse_csr, sparse_csr -> sparse_csr} |
| 182 | + |
| 183 | +``` |
| 184 | +## 底层OP设计 |
| 185 | +对于Coo格式,主要分为两步,第一步操作indices,通过遍历每一行, |
| 186 | +按照指定顺序复制给输出值,第二步使用DDim::transpose改变dims值。 |
| 187 | + |
| 188 | +对于Csr格式,通过分类讨论的方式,分别实现2维和3维的功能, |
| 189 | +对于2维只需要确定两个维度是否切换,3维情况也可通过较复杂的判断实现。 |
| 190 | +## API实现方案 |
| 191 | +对于SparseCsrTensor和SparseCooTensor有相同的API, |
| 192 | +均只需要给定输入张量和维度转换目标。 |
| 193 | + |
| 194 | +# 六、测试和验收的考量 |
| 195 | +测试考虑的case如下: |
| 196 | +- 正确性 |
| 197 | +- csr对2维和3维测试 |
| 198 | +- coo对2维、3维、6维和10维测试 |
| 199 | + |
| 200 | +具体样例如下 |
| 201 | +```python |
| 202 | +class TestTranspose(unittest.TestCase): |
| 203 | + # x: sparse, out: sparse |
| 204 | + def check_result(self, x_shape, dims, format): |
| 205 | + if len(x_shape) == 3: |
| 206 | + mask = paddle.randint(0, 2, [x_shape[-2], x_shape[-1]]) |
| 207 | + else: |
| 208 | + mask = paddle.randint(0, 2, x_shape) |
| 209 | + origin_x = paddle.rand(x_shape) * mask |
| 210 | + |
| 211 | + dense_x = origin_x.detach() |
| 212 | + dense_x.stop_gradient = False |
| 213 | + dense_out = paddle.transpose(dense_x, dims) |
| 214 | + |
| 215 | + if format == "coo": |
| 216 | + sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) |
| 217 | + else: |
| 218 | + sp_x = origin_x.detach().to_sparse_csr() |
| 219 | + sp_x.stop_gradient = False |
| 220 | + sp_out = paddle.incubate.sparse.transpose(sp_x, dims) |
| 221 | + |
| 222 | + np.testing.assert_allclose(sp_out.numpy(), |
| 223 | + dense_out.numpy(), |
| 224 | + rtol=1e-05) |
| 225 | + if get_cuda_version() >= 11030: |
| 226 | + dense_out.backward() |
| 227 | + sp_out.backward() |
| 228 | + np.testing.assert_allclose(sp_x.grad.to_dense().numpy(), |
| 229 | + (dense_x.grad * mask).numpy(), |
| 230 | + rtol=1e-05) |
| 231 | + |
| 232 | + @unittest.skipIf(not paddle.is_compiled_with_cuda() |
| 233 | + or get_cuda_version() < 11000, "only support cuda>=11.0") |
| 234 | + def test_transpose_case1(self): |
| 235 | + self.check_result([16, 12, 3], [2, 1, 0], 'coo') |
| 236 | + self.check_result([16, 12, 3], [2, 1, 0], 'csr') |
| 237 | + |
| 238 | + @unittest.skipIf(not paddle.is_compiled_with_cuda() |
| 239 | + or get_cuda_version() < 11070, "only support cuda>=11.7") |
| 240 | + def test_transpose_case2(self): |
| 241 | + self.check_result([12, 5], [1, 0], 'coo') |
| 242 | + self.check_result([12, 5], [1, 0], 'csr') |
| 243 | + |
| 244 | + @unittest.skipIf(not paddle.is_compiled_with_cuda() |
| 245 | + or get_cuda_version() < 11070, "only support cuda>=11.7") |
| 246 | + def test_transpose_case3(self): |
| 247 | + self.check_result([8, 16, 12, 4, 2, 12], [2, 3, 4, 1, 0, 5], 'coo') |
| 248 | + |
| 249 | + @unittest.skipIf(not paddle.is_compiled_with_cuda() |
| 250 | + or get_cuda_version() < 11070, "only support cuda>=11.7") |
| 251 | + def test_transpose_case4(self): |
| 252 | + self.check_result([i + 2 for i in range(10)], |
| 253 | + [(i + 2) % 10 for i in range(10)], 'coo') |
| 254 | +``` |
| 255 | + |
| 256 | +# 七、可行性分析及规划排期 |
| 257 | +方案的核心算法需自行实现,其余部分复用 Paddle 现有的函数(func)。 |
| 258 | +# 八、影响面 |
| 259 | +为独立新增op,对其他模块没有影响 |
| 260 | +# 名词解释 |
| 261 | +无 |
| 262 | +# 附件及参考资料 |
| 263 | +无 |
0 commit comments