div.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
  13. #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
  14. #include <algorithm>
  15. #include "tensorflow/lite/kernels/internal/common.h"
  16. namespace tflite {
  17. namespace reference_ops {
  18. template <typename T>
  19. inline void DivCheckArithmeticParams(const ArithmeticParams& params) {
  20. TFLITE_DCHECK_LE(params.quantized_activation_min,
  21. params.quantized_activation_max);
  22. // Input offset is negative input zero point. Activation tensors are
  23. // asymmetric quantized so they span the full int8 range.
  24. constexpr int32_t max_value =
  25. static_cast<int32_t>(std::numeric_limits<T>::max());
  26. TFLITE_DCHECK_GE(params.input1_offset, -max_value);
  27. TFLITE_DCHECK_LE(params.input1_offset, max_value);
  28. TFLITE_DCHECK_GE(params.input2_offset, -max_value);
  29. TFLITE_DCHECK_LE(params.input2_offset, max_value);
  30. TFLITE_DCHECK_GE(params.output_offset, -max_value);
  31. TFLITE_DCHECK_LE(params.output_offset, max_value);
  32. }
  33. // Element-wise div that can often be used for inner loop of broadcast Div as
  34. // well as the non-broadcast Div.
  35. template <typename T>
  36. inline void DivElementwise(int size, const ArithmeticParams& params,
  37. const T* input1_data, const T* input2_data,
  38. T* output_data) {
  39. DivCheckArithmeticParams<T>(params);
  40. for (int i = 0; i < size; ++i) {
  41. const int32_t input1_val = params.input1_offset + input1_data[i];
  42. const int32_t input2_val = params.input2_offset + input2_data[i];
  43. TFLITE_DCHECK_NE(input2_val, 0);
  44. int recip_shift;
  45. const int32_t input2_inv =
  46. (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
  47. : -GetReciprocal(-input2_val, 31, &recip_shift);
  48. const int headroom = CountLeadingSignBits(input1_val);
  49. const int32_t unscaled_quotient =
  50. MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
  51. headroom);
  52. const int total_shift = params.output_shift - recip_shift - headroom;
  53. const int32_t unclamped_result =
  54. params.output_offset +
  55. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  56. unscaled_quotient, params.output_multiplier, total_shift);
  57. const int32_t clamped_output =
  58. std::min(params.quantized_activation_max,
  59. std::max(params.quantized_activation_min, unclamped_result));
  60. output_data[i] = static_cast<T>(clamped_output);
  61. }
  62. }
  63. inline void Div(const ArithmeticParams& params,
  64. const RuntimeShape& input1_shape, const uint8_t* input1_data,
  65. const RuntimeShape& input2_shape, const uint8_t* input2_data,
  66. const RuntimeShape& output_shape, uint8_t* output_data) {
  67. TFLITE_DCHECK_LE(params.quantized_activation_min,
  68. params.quantized_activation_max);
  69. const int flat_size =
  70. MatchingElementsSize(input1_shape, input2_shape, output_shape);
  71. DivElementwise(flat_size, params, input1_data, input2_data, output_data);
  72. }
  73. inline void Div(const ArithmeticParams& params,
  74. const RuntimeShape& input1_shape, const int8_t* input1_data,
  75. const RuntimeShape& input2_shape, const int8_t* input2_data,
  76. const RuntimeShape& output_shape, int8_t* output_data) {
  77. TFLITE_DCHECK_LE(params.quantized_activation_min,
  78. params.quantized_activation_max);
  79. const int flat_size =
  80. MatchingElementsSize(input1_shape, input2_shape, output_shape);
  81. DivElementwise(flat_size, params, input1_data, input2_data, output_data);
  82. }
  83. template <typename T, int N = 5>
  84. inline void BroadcastDivSlowQuantized(
  85. const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape,
  86. const T* input1_data, const RuntimeShape& unextended_input2_shape,
  87. const T* input2_data, const RuntimeShape& unextended_output_shape,
  88. T* output_data) {
  89. TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
  90. TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
  91. TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
  92. NdArrayDesc<N> desc1;
  93. NdArrayDesc<N> desc2;
  94. NdArrayDesc<N> output_desc;
  95. NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
  96. unextended_input2_shape, &desc1, &desc2);
  97. CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
  98. &output_desc);
  99. DivCheckArithmeticParams<T>(params);
  100. auto div_func = [&](int indexes[N]) {
  101. const int32_t input1_val =
  102. params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
  103. const int32_t input2_val =
  104. params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
  105. TFLITE_DCHECK_NE(input2_val, 0);
  106. int recip_shift;
  107. const int32_t input2_inv =
  108. (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
  109. : -GetReciprocal(-input2_val, 31, &recip_shift);
  110. const int headroom = CountLeadingSignBits(input1_val);
  111. const int32_t unscaled_quotient =
  112. MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
  113. headroom);
  114. const int total_shift = params.output_shift - recip_shift - headroom;
  115. const int32_t unclamped_result =
  116. params.output_offset +
  117. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  118. unscaled_quotient, params.output_multiplier, total_shift);
  119. const int32_t clamped_output =
  120. std::min(params.quantized_activation_max,
  121. std::max(params.quantized_activation_min, unclamped_result));
  122. output_data[SubscriptToIndex(output_desc, indexes)] =
  123. static_cast<T>(clamped_output);
  124. };
  125. NDOpsHelper<N>(output_desc, div_func);
  126. }
  127. template <int N = 5>
  128. inline void BroadcastDivSlow(const ArithmeticParams& params,
  129. const RuntimeShape& unextended_input1_shape,
  130. const uint8_t* input1_data,
  131. const RuntimeShape& unextended_input2_shape,
  132. const uint8_t* input2_data,
  133. const RuntimeShape& unextended_output_shape,
  134. uint8_t* output_data) {
  135. BroadcastDivSlowQuantized<uint8_t, N>(
  136. params, unextended_input1_shape, input1_data, unextended_input2_shape,
  137. input2_data, unextended_output_shape, output_data);
  138. }
  139. template <int N = 5>
  140. inline void BroadcastDivSlow(const ArithmeticParams& params,
  141. const RuntimeShape& unextended_input1_shape,
  142. const int8_t* input1_data,
  143. const RuntimeShape& unextended_input2_shape,
  144. const int8_t* input2_data,
  145. const RuntimeShape& unextended_output_shape,
  146. int8_t* output_data) {
  147. BroadcastDivSlowQuantized<int8_t, N>(
  148. params, unextended_input1_shape, input1_data, unextended_input2_shape,
  149. input2_data, unextended_output_shape, output_data);
  150. }
  151. // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
  152. // dimensionality if the runtime code does a single loop over one dimension
  153. // that handles broadcasting as the base case. The code generator would then
  154. // generate max(D1, D2) nested for loops.
  155. template <typename T, int N = 5>
  156. void BroadcastDivSlow(const ArithmeticParams& params,
  157. const RuntimeShape& unextended_input1_shape,
  158. const T* input1_data,
  159. const RuntimeShape& unextended_input2_shape,
  160. const T* input2_data,
  161. const RuntimeShape& unextended_output_shape,
  162. T* output_data) {
  163. T output_activation_min;
  164. T output_activation_max;
  165. GetActivationParams(params, &output_activation_min, &output_activation_max);
  166. TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
  167. TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
  168. TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
  169. NdArrayDesc<N> desc1;
  170. NdArrayDesc<N> desc2;
  171. NdArrayDesc<N> output_desc;
  172. NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
  173. unextended_input2_shape, &desc1, &desc2);
  174. CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
  175. &output_desc);
  176. // In Tensorflow, the dimensions are canonically named (batch_number, row,
  177. // col, channel), with extents (batches, height, width, depth), with the
  178. // trailing dimension changing most rapidly (channels has the smallest
  179. // stride, typically 1 element).
  180. //
  181. // In generated C code, we store arrays with the dimensions reversed. The
  182. // first dimension has smallest stride.
  183. auto div_func = [&](int indexes[N]) {
  184. output_data[SubscriptToIndex(output_desc, indexes)] =
  185. ActivationFunctionWithMinMax(
  186. input1_data[SubscriptToIndex(desc1, indexes)] /
  187. input2_data[SubscriptToIndex(desc2, indexes)],
  188. output_activation_min, output_activation_max);
  189. };
  190. NDOpsHelper<N>(output_desc, div_func);
  191. }
  192. template <typename T>
  193. inline void Div(const ArithmeticParams& params,
  194. const RuntimeShape& input1_shape, const T* input1_data,
  195. const RuntimeShape& input2_shape, const T* input2_data,
  196. const RuntimeShape& output_shape, T* output_data) {
  197. T output_activation_min;
  198. T output_activation_max;
  199. GetActivationParams(params, &output_activation_min, &output_activation_max);
  200. const int flat_size =
  201. MatchingElementsSize(input1_shape, input2_shape, output_shape);
  202. for (int i = 0; i < flat_size; ++i) {
  203. output_data[i] = ActivationFunctionWithMinMax(
  204. input1_data[i] / input2_data[i], output_activation_min,
  205. output_activation_max);
  206. }
  207. }
  208. } // namespace reference_ops
  209. } // namespace tflite
  210. #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_