Skip to content

Commit cb7b485

Browse files
authored
fix quantization from fp64 to fp32 (#1153)
1 parent 15a05fd commit cb7b485

File tree

1 file changed

+3
-2
lines changed
  • neural_compressor/adaptor/ox_utils

1 file changed

+3
-2
lines changed

neural_compressor/adaptor/ox_utils/util.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,12 @@ def quantize_data_with_scale_zero(data, qType, scheme, scale, zero_point):
         - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where
             m = max(abs(rmin), abs(rmax))
     '''
+    data = np.asarray(data)
     if qType == onnx_proto.TensorProto.INT8 and scheme == 'sym':
         # signed byte type
-        quantized_data = (np.asarray(data) / scale).round().astype('b')
+        quantized_data = (data.astype(np.float32) / scale).round().astype('b')
     elif qType == onnx_proto.TensorProto.UINT8 and scheme == 'asym':
-        quantized_data = ((np.asarray(data) / scale).round() + zero_point).astype('B')
+        quantized_data = ((data.astype(np.float32) / scale).round() + zero_point).astype('B')
     else:
         raise ValueError("Unexpected combination of data type {} and scheme {}.".format(
            qType, scheme))

0 commit comments

Comments
 (0)