Quantization Operators¶
Quantization is a model optimization technique to reduce the size of a large model in order to achieve better storage performance with a small loss in accuracy.
CUDA Operators¶
-
DLL_PUBLIC at::Tensor _float_to_bfloat16_gpu (const at::Tensor &input)
Converts a tensor of float values into a tensor of Brain Floating Point (bfloat16) values.
-
DLL_PUBLIC at::Tensor _bfloat16_to_float_gpu (const at::Tensor &input)
Converts a tensor of Brain Floating Point (bfloat16) values into a tensor of float values.
-
DLL_PUBLIC Tensor _float_to_FP8rowwise_gpu (const Tensor &input, const bool forward)
-
DLL_PUBLIC Tensor _float_to_fused8bitrowwise_gpu (const Tensor &input)
-
DLL_PUBLIC Tensor _single_or_half_precision_to_fused8bitrowwise_gpu (const Tensor &input)
-
DLL_PUBLIC at::Tensor _fused8bitrowwise_to_single_or_half_precision_gpu (const at::Tensor &input, const int64_t output_dtype)
-
DLL_PUBLIC at::Tensor _fused8bitrowwise_to_float_mixed_dim_gpu (const at::Tensor &input, const at::Tensor &D_offsets, const int64_t output_dtype)
-
template<typename input_t>
Tensor _float_to_fusednbitrowwise_gpu_t (const Tensor &input, const int64_t bit_rate)¶
-
DLL_PUBLIC Tensor _float_to_fusednbitrowwise_gpu (const Tensor &input, const int64_t bit_rate)
-
DLL_PUBLIC at::Tensor _half_to_fusednbitrowwise_gpu (const at::Tensor &input, const int64_t bit_rate)
-
template<typename output_t>
Tensor _fusednbitrowwise_to_float_gpu_t (const Tensor &input, const int64_t bit_rate)¶
-
DLL_PUBLIC at::Tensor _fusednbitrowwise_to_half_gpu (const at::Tensor &input, const int64_t bit_rate)
-
DLL_PUBLIC at::Tensor _fusednbitrowwise_to_float_or_half_gpu (const at::Tensor &input, const int64_t bit_rate, const int64_t output_dtype)
-
DLL_PUBLIC at::Tensor _float_to_hfp8_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias, const double max_pos)
Converts a tensor of float values into a tensor of Hybrid 8-bit Floating Point (hfp8) values.
-
DLL_PUBLIC at::Tensor _hfp8_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias)
Converts a tensor of Hybrid 8-bit Floating Point (hfp8) values into a tensor of float values.
-
DLL_PUBLIC at::Tensor _float_to_msfp_gpu (const at::Tensor &input, const int64_t bounding_box_size, const int64_t ebits, const int64_t mbits, const int64_t bias, const double min_pos, const double max_pos)
Converts a tensor of float values into a tensor of Microsoft Floating Point (msfp) values.
-
DLL_PUBLIC at::Tensor _msfp_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t mbits, const int64_t bias)
Converts a tensor of Microsoft Floating Point (msfp) values into a tensor of float values.
-
DLL_PUBLIC Tensor _float_to_paddedFP8rowwise_gpu (const Tensor &input, const bool forward, const int64_t row_dim)
CPU Operators¶
-
Tensor &
_fused8bitrowwise_to_float_cpu_out
(Tensor &output, const Tensor &input)¶
-
Tensor &
_float_to_fused8bitrowwise_cpu_out
(Tensor &output, const Tensor &input)¶
-
Tensor
float_to_fused8bitrowwise_cpu
(const Tensor &input)¶
-
Tensor
half_to_fused8bitrowwise_cpu
(const Tensor &input)¶
-
Tensor
float_or_half_to_fused8bitrowwise_cpu
(const Tensor &input)¶
-
Tensor
fused8bitrowwise_to_float_cpu
(const Tensor &input)¶
-
Tensor
fused8bitrowwise_to_half_cpu
(const Tensor &input)¶
-
Tensor
fused8bitrowwise_to_float_or_half_cpu
(const Tensor &input, const int64_t output_dtype)¶
-
Tensor
float_to_FP8rowwise_cpu
(const Tensor &input, bool forward)¶
-
Tensor
FP8rowwise_to_float_cpu
(const Tensor &input, bool forward, const int64_t output_dtype)¶
-
Tensor
fusednbitrowwise_to_float_cpu
(const Tensor &input, const int64_t bit_rate)¶
-
Tensor
fusednbitrowwise_to_half_cpu
(const Tensor &input, const int64_t bit_rate)¶
-
Tensor
fusednbitrowwise_to_float_or_half_cpu
(const Tensor &input, const int64_t bit_rate, const int64_t output_dtype)¶
-
void
FloatToFP8Quantized_ref
(const float *const input, const size_t nrows, const size_t ncols, uint8_t *const output, const int ebits, const int exponent_bias, const double max_pos)¶
-
void
FP8QuantizedToFloat_ref
(const uint8_t *const input, const size_t nrows, const size_t ncols, float *const output, const int ebits, const int exponent_bias)¶