quantization_util.cc 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #include "tensorflow/lite/kernels/internal/quantization_util.h"
  13. #include <algorithm>
  14. #include <cmath>
  15. #include <limits>
  16. #include "tensorflow/lite/kernels/internal/compatibility.h"
  17. #include "tensorflow/lite/kernels/internal/cppmath.h"
  18. namespace tflite {
  19. namespace {
  20. // These constants are used to manipulate the binary representation of doubles.
  21. // Double-precision binary64 floating point format is:
  22. // Bit | 63 | 62-52 | 51-0 |
  23. // | Sign | Exponent | Fraction |
  24. // To avoid 64-bit integers as much as possible, I break this into high and
  25. // low 32-bit chunks. High is:
  26. // Bit | 31 | 30-20 | 19-0 |
  27. // | Sign | Exponent | High Fraction |
  28. // Low is:
  29. // Bit | 31-0 |
  30. // | Low Fraction |
  31. // We then access the components through logical bit-wise operations to
  32. // extract the parts needed, with the positions and masks derived from the
  33. // layout shown above.
  34. constexpr uint64_t kSignMask = 0x8000000000000000LL;
  35. constexpr uint64_t kExponentMask = 0x7ff0000000000000LL;
  36. constexpr int32_t kExponentShift = 52;
  37. constexpr int32_t kExponentBias = 1023;
  38. constexpr uint32_t kExponentIsBadNum = 0x7ff;
  39. constexpr uint64_t kFractionMask = 0x000fffffffc00000LL;
  40. constexpr uint32_t kFractionShift = 22;
  41. constexpr uint32_t kFractionRoundingMask = 0x003fffff;
  42. constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
  43. } // namespace
  44. void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
  45. int* shift) {
  46. #if TFLITE_SINGLE_ROUNDING
  47. // Single-rounding MultiplyByQuantizedMultiplier only supports positive
  48. // multipliers.
  49. // TFLITE_DCHECK(double_multiplier >= 0);
  50. #endif
  51. if (double_multiplier == 0.) {
  52. *quantized_multiplier = 0;
  53. *shift = 0;
  54. return;
  55. }
  56. #ifdef TFLITE_EMULATE_FLOAT
  57. // If we're trying to avoid the use of floating-point instructions (for
  58. // example on microcontrollers) then use an alternative implementation
  59. // that only requires integer and bitwise operations. To enable this, you
  60. // need to set the define during the build process for your platform.
  61. int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
  62. #else // TFLITE_EMULATE_FLOAT
  63. const double q = std::frexp(double_multiplier, shift);
  64. auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1LL << 31)));
  65. #endif // TFLITE_EMULATE_FLOAT
  66. TFLITE_CHECK(q_fixed <= (1LL << 31));
  67. if (q_fixed == (1LL << 31)) {
  68. q_fixed /= 2;
  69. ++*shift;
  70. }
  71. TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
  72. // A shift amount smaller than -31 would cause all bits to be shifted out
  73. // and thus all results would be zero. We implement that instead with
  74. // q_fixed==0, so as to avoid hitting issues with right-shift
  75. // operations with shift amounts greater than 31. Note that this happens
  76. // roughly when abs(double_multiplier) < 2^-31 and the present handling means
  77. // that we're effectively flushing tiny double_multiplier's to zero.
  78. // We could conceivably handle values in the range (roughly) [32, 63]
  79. // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
  80. // the present handling is just doing 'flush denormals to zero'. We could
  81. // reconsider and actually generate nonzero denormals if a need arises.
  82. if (*shift < -31) {
  83. *shift = 0;
  84. q_fixed = 0;
  85. }
  86. #if TFLITE_SINGLE_ROUNDING
  87. // Single-rounding MultiplyByQuantizedMultiplier doesn't support a shift > 30,
  88. // saturate it.
  89. if (*shift > 30) {
  90. *shift = 30;
  91. q_fixed = (1LL << 31) - 1;
  92. }
  93. #endif
  94. *quantized_multiplier = static_cast<int32_t>(q_fixed);
  95. }
  96. void QuantizeMultiplierGreaterThanOne(double double_multiplier,
  97. int32_t* quantized_multiplier,
  98. int* left_shift) {
  99. TFLITE_CHECK_GT(double_multiplier, 1.);
  100. QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
  101. TFLITE_CHECK_GE(*left_shift, 0);
  102. }
  103. void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
  104. int32_t* quantized_multiplier,
  105. int* left_shift) {
  106. TFLITE_CHECK_LT(double_multiplier, 1.);
  107. TFLITE_CHECK_GT(double_multiplier, 0.);
  108. int shift;
  109. QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
  110. TFLITE_CHECK_LE(shift, 0);
  111. *left_shift = shift;
  112. }
  113. int64_t IntegerFrExp(double input, int* shift) {
  114. // Make sure our assumptions about the double layout hold.
  115. TFLITE_CHECK_EQ(8, sizeof(double));
  116. // We want to access the bits of the input double value directly, which is
  117. // tricky to do safely, so use a union to handle the casting.
  118. union {
  119. double double_value;
  120. uint64_t double_as_uint;
  121. } cast_union;
  122. cast_union.double_value = input;
  123. const uint64_t u = cast_union.double_as_uint;
  124. // If the bitfield is all zeros apart from the sign bit, this is a normalized
  125. // zero value, so return standard values for this special case.
  126. if ((u & ~kSignMask) == 0) {
  127. *shift = 0;
  128. return 0;
  129. }
  130. // Deal with NaNs and Infs, which are always indicated with a fixed pattern in
  131. // the exponent, and distinguished by whether the fractions are zero or
  132. // non-zero.
  133. const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
  134. if (exponent_part == kExponentIsBadNum) {
  135. *shift = std::numeric_limits<int>::max();
  136. if (u & kFractionMask) {
  137. // NaN, so just return zero (with the exponent set to INT_MAX).
  138. return 0;
  139. } else {
  140. // Infinity, so return +/- INT_MAX.
  141. if (u & kSignMask) {
  142. return std::numeric_limits<int64_t>::min();
  143. } else {
  144. return std::numeric_limits<int64_t>::max();
  145. }
  146. }
  147. }
  148. // The shift is fairly easy to extract from the high bits of the double value,
  149. // just by masking it out and applying a bias. The std::frexp() implementation
  150. // always returns values between 0.5 and 1.0 though, whereas the exponent
  151. // assumes 1.0 to 2.0 is the standard range, so I add on one to match that
  152. // interface.
  153. *shift = (exponent_part - kExponentBias) + 1;
  154. // There's an implicit high bit in the double format definition, so make sure
  155. // we include that at the top, and then reconstruct the rest of the fractional
  156. // value from the remaining fragments.
  157. int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);
  158. // We're cutting off some bits at the bottom, so to exactly match the standard
  159. // frexp implementation here we'll apply rounding by adding one to the least
  160. // significant bit of the result if the discarded portion is over half of the
  161. // maximum.
  162. if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
  163. fraction += 1;
  164. }
  165. // Negate the fraction if the sign bit was set.
  166. if (u & kSignMask) {
  167. fraction *= -1;
  168. }
  169. return fraction;
  170. }
  171. double DoubleFromFractionAndShift(int64_t fraction, int shift) {
  172. union {
  173. double double_value;
  174. uint64_t double_as_uint;
  175. } result;
  176. // Detect NaNs and infinities.
  177. if (shift == std::numeric_limits<int>::max()) {
  178. if (fraction == 0) {
  179. return std::numeric_limits<double>::quiet_NaN();
  180. } else if (fraction > 0) {
  181. return std::numeric_limits<double>::infinity();
  182. } else {
  183. return -std::numeric_limits<double>::infinity();
  184. }
  185. }
  186. // Return a normalized zero for a zero fraction.
  187. if (fraction == 0) {
  188. result.double_as_uint = 0;
  189. return result.double_value;
  190. }
  191. bool is_negative = (fraction < 0);
  192. int64_t encoded_fraction = is_negative ? -fraction : fraction;
  193. int64_t encoded_shift = (shift - 1);
  194. while (encoded_fraction < 0x40000000) {
  195. encoded_fraction *= 2;
  196. encoded_shift -= 1;
  197. }
  198. while (encoded_fraction > 0x80000000) {
  199. encoded_fraction /= 2;
  200. encoded_shift += 1;
  201. }
  202. encoded_fraction -= 0x40000000;
  203. if (encoded_shift < -1022) {
  204. encoded_shift = -1023;
  205. } else if (encoded_shift > 1022) {
  206. encoded_shift = 1023;
  207. }
  208. encoded_shift += kExponentBias;
  209. uint64_t encoded_sign = is_negative ? kSignMask : 0;
  210. result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
  211. (encoded_fraction << kFractionShift);
  212. return result.double_value;
  213. }
  214. double IntegerDoubleMultiply(double a, double b) {
  215. int a_shift;
  216. const int64_t a_fraction = IntegerFrExp(a, &a_shift);
  217. int b_shift;
  218. const int64_t b_fraction = IntegerFrExp(b, &b_shift);
  219. // Detect NaNs and infinities.
  220. if (a_shift == std::numeric_limits<int>::max() ||
  221. (b_shift == std::numeric_limits<int>::max())) {
  222. return std::numeric_limits<double>::quiet_NaN();
  223. }
  224. const int result_shift = a_shift + b_shift + 1;
  225. const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
  226. return DoubleFromFractionAndShift(result_fraction, result_shift);
  227. }
  228. int IntegerDoubleCompare(double a, double b) {
  229. int a_shift;
  230. const int64_t a_fraction = IntegerFrExp(a, &a_shift);
  231. int b_shift;
  232. const int64_t b_fraction = IntegerFrExp(b, &b_shift);
  233. // Detect NaNs and infinities.
  234. if (a_shift == std::numeric_limits<int>::max() ||
  235. (b_shift == std::numeric_limits<int>::max())) {
  236. return 1;
  237. }
  238. if ((a_fraction == 0) && (b_fraction < 0)) {
  239. return 1;
  240. } else if ((a_fraction < 0) && (b_fraction == 0)) {
  241. return -1;
  242. } else if (a_shift < b_shift) {
  243. return -1;
  244. } else if (a_shift > b_shift) {
  245. return 1;
  246. } else if (a_fraction < b_fraction) {
  247. return -1;
  248. } else if (a_fraction > b_fraction) {
  249. return 1;
  250. } else {
  251. return 0;
  252. }
  253. }
  254. void PreprocessSoftmaxScaling(double beta, double input_scale,
  255. int input_integer_bits,
  256. int32_t* quantized_multiplier, int* left_shift) {
  257. // If the overall multiplier (input and beta) is large, then exp() of an
  258. // input difference of 1 scaled by this will be large. In other words, we
  259. // can cap the multiplier and know that, when it is used, the output will be
  260. // (round to) zero wherever the input is not at the maximum value.
  261. // If the overall scale is less than one, and input_integer_bits=0, then the
  262. // result is double equivalent of Q0.31 (actually with more precision). Thus
  263. // this generates a Q(input_integer_bits).(31-input_integer_bits)
  264. // representation.
  265. #if TFLITE_SINGLE_ROUNDING
  266. const double max_real_multiplier = (1LL << 30) - 1.0;
  267. #else
  268. const double max_real_multiplier = (1LL << 31) - 1.0;
  269. #endif
  270. #ifdef TFLITE_EMULATE_FLOAT
  271. const double input_beta = IntegerDoubleMultiply(beta, input_scale);
  272. int shift;
  273. int64_t fraction = IntegerFrExp(input_beta, &shift);
  274. shift += (31 - input_integer_bits);
  275. double input_beta_real_multiplier =
  276. DoubleFromFractionAndShift(fraction, shift);
  277. if (IntegerDoubleCompare(input_beta_real_multiplier, max_real_multiplier) >
  278. 0) {
  279. input_beta_real_multiplier = max_real_multiplier;
  280. }
  281. #else // TFLITE_EMULATE_FLOAT
  282. const double input_beta_real_multiplier =
  283. std::min<double>(beta * input_scale * (1 << (31 - input_integer_bits)),
  284. max_real_multiplier);
  285. #endif // TFLITE_EMULATE_FLOAT
  286. QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
  287. quantized_multiplier, left_shift);
  288. }
  289. void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
  290. int input_integer_bits,
  291. int32_t* quantized_multiplier,
  292. int* left_shift,
  293. int32_t* reverse_scaling_divisor,
  294. int* reverse_scaling_left_shift) {
  295. PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
  296. quantized_multiplier, left_shift);
  297. // Also calculate what amounts to the inverse scaling factor for the input.
  298. const double real_reverse_scaling_divisor =
  299. (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
  300. tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
  301. reverse_scaling_divisor,
  302. reverse_scaling_left_shift);
  303. }
  304. int CalculateInputRadius(int input_integer_bits, int input_left_shift,
  305. int total_signed_bits) {
  306. #ifdef TFLITE_EMULATE_FLOAT
  307. int64_t result = (1 << input_integer_bits) - 1;
  308. result <<= (total_signed_bits - input_integer_bits);
  309. result >>= input_left_shift;
  310. return result;
  311. #else // TFLITE_EMULATE_FLOAT
  312. const double max_input_rescaled =
  313. 1.0 * ((1 << input_integer_bits) - 1) *
  314. (1LL << (total_signed_bits - input_integer_bits)) /
  315. (1LL << input_left_shift);
  316. // Tighten bound using floor. Suppose that we could use the exact value.
  317. // After scaling the difference, the result would be at the maximum. Thus we
  318. // must ensure that our value has lower magnitude.
  319. return static_cast<int>(std::floor(max_input_rescaled));
  320. #endif // TFLITE_EMULATE_FLOAT
  321. }
  322. void NudgeQuantizationRange(const float min, const float max,
  323. const int quant_min, const int quant_max,
  324. float* nudged_min, float* nudged_max,
  325. float* nudged_scale) {
  326. // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
  327. const float quant_min_float = static_cast<float>(quant_min);
  328. const float quant_max_float = static_cast<float>(quant_max);
  329. *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
  330. const float zero_point_from_min = quant_min_float - min / *nudged_scale;
  331. uint16_t nudged_zero_point;
  332. if (zero_point_from_min < quant_min_float) {
  333. nudged_zero_point = static_cast<uint16_t>(quant_min);
  334. } else if (zero_point_from_min > quant_max_float) {
  335. nudged_zero_point = static_cast<uint16_t>(quant_max);
  336. } else {
  337. nudged_zero_point = static_cast<uint16_t>(TfLiteRound(zero_point_from_min));
  338. }
  339. *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
  340. *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
  341. }
  342. void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
  343. const float nudged_max, const float* input_data,
  344. float* output_data, const float size) {
  345. // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
  346. const float inv_nudged_scale = 1.0f / nudged_scale;
  347. for (int i = 0; i < size; i++) {
  348. const float src_val = input_data[i];
  349. const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
  350. const float clamped_shifted = clamped - nudged_min;
  351. const float dst_val =
  352. TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
  353. nudged_min;
  354. output_data[i] = dst_val;
  355. }
  356. }
  357. bool CheckedLog2(const float x, int* log2_result) {
  358. // Using TfLiteRound instead of std::round and std::log instead of
  359. // std::log2 to work around these functions being missing in a toolchain
  360. // used in some TensorFlow tests as of May 2018.
  361. const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
  362. const float x_log2_rounded = TfLiteRound(x_log2);
  363. const float x_log2_fracpart = x_log2 - x_log2_rounded;
  364. *log2_result = static_cast<int>(x_log2_rounded);
  365. return std::abs(x_log2_fracpart) < 1e-3f;
  366. }
  367. void QuantizeMultiplierArray(const double* effective_scales, size_t size,
  368. int32_t* effective_scale_significand,
  369. int* effective_shift) {
  370. for (size_t i = 0; i < size; ++i) {
  371. QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
  372. &effective_shift[i]);
  373. }
  374. }
  375. } // namespace tflite