sub.h 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556
  1. /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
  13. #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
  14. #include <stdint.h>
  15. #include <algorithm>
  16. #include <limits>
  17. #include "ruy/profiler/instrumentation.h" // from @ruy
  18. #include "tensorflow/lite/kernels/internal/common.h"
  19. #include "tensorflow/lite/kernels/internal/compatibility.h"
  20. #include "tensorflow/lite/kernels/internal/types.h"
  21. namespace tflite {
  22. namespace reference_ops {
  23. inline void SubNonBroadcast(const ArithmeticParams& params,
  24. const RuntimeShape& input1_shape,
  25. const float* input1_data,
  26. const RuntimeShape& input2_shape,
  27. const float* input2_data,
  28. const RuntimeShape& output_shape,
  29. float* output_data) {
  30. const int flat_size =
  31. MatchingElementsSize(input1_shape, input2_shape, output_shape);
  32. for (int i = 0; i < flat_size; ++i) {
  33. output_data[i] = ActivationFunctionWithMinMax(
  34. input1_data[i] - input2_data[i], params.float_activation_min,
  35. params.float_activation_max);
  36. }
  37. }
  38. inline void SubNonBroadcast(const ArithmeticParams& params,
  39. const RuntimeShape& input1_shape,
  40. const int32_t* input1_data,
  41. const RuntimeShape& input2_shape,
  42. const int32_t* input2_data,
  43. const RuntimeShape& output_shape,
  44. int32_t* output_data) {
  45. const int flat_size =
  46. MatchingElementsSize(input1_shape, input2_shape, output_shape);
  47. for (int i = 0; i < flat_size; ++i) {
  48. output_data[i] = ActivationFunctionWithMinMax(
  49. input1_data[i] - input2_data[i], params.quantized_activation_min,
  50. params.quantized_activation_max);
  51. }
  52. }
  53. // TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
  54. // dimensionality if the runtime code does a single loop over one dimension
  55. // that handles broadcasting as the base case. The code generator would then
  56. // generate max(D1, D2) nested for loops.
  57. template <int N = 5>
  58. inline void BroadcastSubSlow(const ArithmeticParams& params,
  59. const RuntimeShape& input1_shape,
  60. const float* input1_data,
  61. const RuntimeShape& input2_shape,
  62. const float* input2_data,
  63. const RuntimeShape& output_shape,
  64. float* output_data) {
  65. ruy::profiler::ScopeLabel label("BroadcastSubSlow/float");
  66. TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  67. TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  68. TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  69. NdArrayDesc<N> desc1;
  70. NdArrayDesc<N> desc2;
  71. NdArrayDesc<N> output_desc;
  72. NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
  73. &desc2);
  74. CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
  75. // In Tensorflow, the dimensions are canonically named (batch_number, row,
  76. // col, channel), with extents (batches, height, width, depth), with the
  77. // trailing dimension changing most rapidly (channels has the smallest stride,
  78. // typically 1 element).
  79. //
  80. // In generated C code, we store arrays with the dimensions reversed. The
  81. // first dimension has smallest stride.
  82. //
  83. // We name our variables by their Tensorflow convention, but generate C code
  84. // nesting loops such that the innermost loop has the smallest stride for the
  85. // best cache behavior.
  86. auto sub_func = [&](int indexes[N]) {
  87. output_data[SubscriptToIndex(output_desc, indexes)] =
  88. ActivationFunctionWithMinMax(
  89. input1_data[SubscriptToIndex(desc1, indexes)] -
  90. input2_data[SubscriptToIndex(desc2, indexes)],
  91. params.float_activation_min, params.float_activation_max);
  92. };
  93. NDOpsHelper<N>(output_desc, sub_func);
  94. }
  95. template <int N = 5>
  96. inline void BroadcastSubSlow(const ArithmeticParams& params,
  97. const RuntimeShape& input1_shape,
  98. const uint8_t* input1_data,
  99. const RuntimeShape& input2_shape,
  100. const uint8_t* input2_data,
  101. const RuntimeShape& output_shape,
  102. uint8_t* output_data) {
  103. ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8_t");
  104. TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  105. TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  106. TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  107. NdArrayDesc<N> desc1;
  108. NdArrayDesc<N> desc2;
  109. NdArrayDesc<N> output_desc;
  110. NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
  111. &desc2);
  112. CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
  113. // In Tensorflow, the dimensions are canonically named (batch_number, row,
  114. // col, channel), with extents (batches, height, width, depth), with the
  115. // trailing dimension changing most rapidly (channels has the smallest stride,
  116. // typically 1 element).
  117. //
  118. // In generated C code, we store arrays with the dimensions reversed. The
  119. // first dimension has smallest stride.
  120. //
  121. // We name our variables by their Tensorflow convention, but generate C code
  122. // nesting loops such that the innermost loop has the smallest stride for the
  123. // best cache behavior.
  124. auto sub_func = [&](int indexes[N]) {
  125. const int32_t input1_val =
  126. params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
  127. const int32_t input2_val =
  128. params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
  129. const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
  130. const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
  131. const int32_t scaled_input1_val =
  132. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  133. shifted_input1_val, params.input1_multiplier, params.input1_shift);
  134. const int32_t scaled_input2_val =
  135. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  136. shifted_input2_val, params.input2_multiplier, params.input2_shift);
  137. const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
  138. const int32_t raw_output =
  139. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  140. raw_sub, params.output_multiplier, params.output_shift) +
  141. params.output_offset;
  142. const int32_t clamped_output =
  143. std::min(params.quantized_activation_max,
  144. std::max(params.quantized_activation_min, raw_output));
  145. output_data[SubscriptToIndex(output_desc, indexes)] =
  146. static_cast<uint8_t>(clamped_output);
  147. };
  148. NDOpsHelper<N>(output_desc, sub_func);
  149. }
  150. template <int N = 5>
  151. inline void BroadcastSubSlow(const ArithmeticParams& params,
  152. const RuntimeShape& input1_shape,
  153. const int32_t* input1_data,
  154. const RuntimeShape& input2_shape,
  155. const int32_t* input2_data,
  156. const RuntimeShape& output_shape,
  157. int32_t* output_data) {
  158. ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
  159. TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  160. TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  161. TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  162. NdArrayDesc<N> desc1;
  163. NdArrayDesc<N> desc2;
  164. NdArrayDesc<N> output_desc;
  165. NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
  166. &desc2);
  167. CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
  168. // In Tensorflow, the dimensions are canonically named (batch_number, row,
  169. // col, channel), with extents (batches, height, width, depth), with the
  170. // trailing dimension changing most rapidly (channels has the smallest stride,
  171. // typically 1 element).
  172. //
  173. // In generated C code, we store arrays with the dimensions reversed. The
  174. // first dimension has smallest stride.
  175. //
  176. // We name our variables by their Tensorflow convention, but generate C code
  177. // nesting loops such that the innermost loop has the smallest stride for the
  178. // best cache behavior.
  179. auto sub_func = [&](int indexes[N]) {
  180. output_data[SubscriptToIndex(output_desc, indexes)] =
  181. ActivationFunctionWithMinMax(
  182. input1_data[SubscriptToIndex(desc1, indexes)] -
  183. input2_data[SubscriptToIndex(desc2, indexes)],
  184. params.quantized_activation_min, params.quantized_activation_max);
  185. };
  186. NDOpsHelper<N>(output_desc, sub_func);
  187. }
  188. template <int N = 5>
  189. inline void BroadcastSubSlow(const ArithmeticParams& params,
  190. const RuntimeShape& input1_shape,
  191. const int8_t* input1_data,
  192. const RuntimeShape& input2_shape,
  193. const int8_t* input2_data,
  194. const RuntimeShape& output_shape,
  195. int8_t* output_data) {
  196. ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8_t");
  197. NdArrayDesc<N> desc1;
  198. NdArrayDesc<N> desc2;
  199. NdArrayDesc<N> output_desc;
  200. NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
  201. &desc2);
  202. CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
  203. // In Tensorflow, the dimensions are canonically named (batch_number, row,
  204. // col, channel), with extents (batches, height, width, depth), with the
  205. // trailing dimension changing most rapidly (channels has the smallest stride,
  206. // typically 1 element).
  207. //
  208. // In generated C code, we store arrays with the dimensions reversed. The
  209. // first dimension has smallest stride.
  210. //
  211. // We name our variables by their Tensorflow convention, but generate C code
  212. // nesting loops such that the innermost loop has the smallest stride for the
  213. // best cache behavior.
  214. auto sub_func = [&](int indexes[N]) {
  215. const int32_t input1_val =
  216. params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
  217. const int32_t input2_val =
  218. params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
  219. const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
  220. const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
  221. const int32_t scaled_input1_val =
  222. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  223. shifted_input1_val, params.input1_multiplier, params.input1_shift);
  224. const int32_t scaled_input2_val =
  225. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  226. shifted_input2_val, params.input2_multiplier, params.input2_shift);
  227. const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
  228. const int32_t raw_output =
  229. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  230. raw_sub, params.output_multiplier, params.output_shift) +
  231. params.output_offset;
  232. const int32_t clamped_output =
  233. std::min(params.quantized_activation_max,
  234. std::max(params.quantized_activation_min, raw_output));
  235. output_data[SubscriptToIndex(output_desc, indexes)] =
  236. static_cast<int8_t>(clamped_output);
  237. };
  238. NDOpsHelper<N>(output_desc, sub_func);
  239. }
  240. template <int N = 5>
  241. void BroadcastSubSlow(const ArithmeticParams& params,
  242. const RuntimeShape& input1_shape,
  243. const int64_t* input1_data,
  244. const RuntimeShape& input2_shape,
  245. const int64_t* input2_data,
  246. const RuntimeShape& output_shape, int64_t* output_data) {
  247. ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
  248. TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  249. TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  250. TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  251. NdArrayDesc<N> desc1;
  252. NdArrayDesc<N> desc2;
  253. NdArrayDesc<N> output_desc;
  254. NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
  255. &desc2);
  256. CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
  257. // In Tensorflow, the dimensions are canonically named (batch_number, row,
  258. // col, channel), with extents (batches, height, width, depth), with the
  259. // trailing dimension changing most rapidly (channels has the smallest stride,
  260. // typically 1 element).
  261. //
  262. // In generated C code, we store arrays with the dimensions reversed. The
  263. // first dimension has smallest stride.
  264. //
  265. // We name our variables by their Tensorflow convention, but generate C code
  266. // nesting loops such that the innermost loop has the smallest stride for the
  267. // best cache behavior.
  268. auto sub_func = [&](int indexes[N]) {
  269. output_data[SubscriptToIndex(output_desc, indexes)] =
  270. ActivationFunctionWithMinMax(
  271. input1_data[SubscriptToIndex(desc1, indexes)] -
  272. input2_data[SubscriptToIndex(desc2, indexes)],
  273. params.int64_activation_min, params.int64_activation_max);
  274. };
  275. NDOpsHelper<N>(output_desc, sub_func);
  276. }
  277. template <typename T, int N = 5>
  278. void BroadcastSubSlow(const ArithmeticParams& params,
  279. const RuntimeShape& input1_shape, const T* input1_data,
  280. const RuntimeShape& input2_shape, const T* input2_data,
  281. const RuntimeShape& output_shape, T* output_data) {
  282. ruy::profiler::ScopeLabel label("BroadcastSubSlow/templated");
  283. TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  284. TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  285. TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  286. NdArrayDesc<N> desc1;
  287. NdArrayDesc<N> desc2;
  288. NdArrayDesc<N> output_desc;
  289. NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
  290. &desc2);
  291. CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
  292. // In Tensorflow, the dimensions are canonically named (batch_number, row,
  293. // col, channel), with extents (batches, height, width, depth), with the
  294. // trailing dimension changing most rapidly (channels has the smallest stride,
  295. // typically 1 element).
  296. //
  297. // In generated C code, we store arrays with the dimensions reversed. The
  298. // first dimension has smallest stride.
  299. //
  300. // We name our variables by their Tensorflow convention, but generate C code
  301. // nesting loops such that the innermost loop has the smallest stride for the
  302. // best cache behavior.
  303. auto sub_func = [&](int indexes[N]) {
  304. output_data[SubscriptToIndex(output_desc, indexes)] =
  305. ActivationFunctionWithMinMax(
  306. input1_data[SubscriptToIndex(desc1, indexes)] -
  307. input2_data[SubscriptToIndex(desc2, indexes)],
  308. params.quantized_activation_min, params.quantized_activation_max);
  309. };
  310. NDOpsHelper<N>(output_desc, sub_func);
  311. }
  312. template <int N = 5>
  313. inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
  314. const RuntimeShape& input1_shape,
  315. const int16_t* input1_data,
  316. const RuntimeShape& input2_shape,
  317. const int16_t* input2_data,
  318. const RuntimeShape& output_shape,
  319. int16_t* output_data) {
  320. ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
  321. NdArrayDesc<N> desc1;
  322. NdArrayDesc<N> desc2;
  323. NdArrayDesc<N> output_desc;
  324. NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
  325. &desc2);
  326. CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
  327. // In Tensorflow, the dimensions are canonically named (batch_number, row,
  328. // col, channel), with extents (batches, height, width, depth), with the
  329. // trailing dimension changing most rapidly (channels has the smallest stride,
  330. // typically 1 element).
  331. //
  332. // In generated C code, we store arrays with the dimensions reversed. The
  333. // first dimension has smallest stride.
  334. //
  335. // We name our variables by their Tensorflow convention, but generate C code
  336. // nesting loops such that the innermost loop has the smallest stride for the
  337. // best cache behavior.
  338. auto sub_func = [&](int indexes[N]) {
  339. const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
  340. const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
  341. const int32_t scaled_input1_val =
  342. gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
  343. const int32_t scaled_input2_val =
  344. gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
  345. const int32_t raw_output = scaled_input1_val - scaled_input2_val;
  346. const int32_t clamped_output =
  347. std::min(params.quantized_activation_max,
  348. std::max(params.quantized_activation_min, raw_output));
  349. output_data[SubscriptToIndex(output_desc, indexes)] =
  350. static_cast<int16_t>(clamped_output);
  351. };
  352. NDOpsHelper<N>(output_desc, sub_func);
  353. }
  354. // Element-wise Sub that can often be used for inner loop of broadcast sub as
  355. // well as the non-broadcast sub.
  356. inline void SubElementwise(int size, const ArithmeticParams& params,
  357. const uint8_t* input1_data,
  358. const uint8_t* input2_data, uint8_t* output_data) {
  359. TFLITE_DCHECK_GT(params.input1_offset, -256);
  360. TFLITE_DCHECK_GT(params.input2_offset, -256);
  361. TFLITE_DCHECK_LT(params.input1_offset, 256);
  362. TFLITE_DCHECK_LT(params.input2_offset, 256);
  363. for (int i = 0; i < size; ++i) {
  364. const int32_t input1_val = params.input1_offset + input1_data[i];
  365. const int32_t input2_val = params.input2_offset + input2_data[i];
  366. const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
  367. const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
  368. const int32_t scaled_input1_val =
  369. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  370. shifted_input1_val, params.input1_multiplier, params.input1_shift);
  371. const int32_t scaled_input2_val =
  372. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  373. shifted_input2_val, params.input2_multiplier, params.input2_shift);
  374. const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
  375. const int32_t raw_output =
  376. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  377. raw_sub, params.output_multiplier, params.output_shift) +
  378. params.output_offset;
  379. const int32_t clamped_output =
  380. std::min(params.quantized_activation_max,
  381. std::max(params.quantized_activation_min, raw_output));
  382. output_data[i] = static_cast<uint8_t>(clamped_output);
  383. }
  384. }
  385. // Element-wise add that can often be used for inner loop of broadcast add as
  386. // well as the non-broadcast add.
  387. inline void SubElementwise(int size, const ArithmeticParams& params,
  388. const int8_t* input1_data, const int8_t* input2_data,
  389. int8_t* output_data) {
  390. const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
  391. TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
  392. TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
  393. TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
  394. TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
  395. for (int i = 0; i < size; ++i) {
  396. const int32_t input1_val = params.input1_offset + input1_data[i];
  397. const int32_t input2_val = params.input2_offset + input2_data[i];
  398. const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
  399. const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
  400. const int32_t scaled_input1_val =
  401. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  402. shifted_input1_val, params.input1_multiplier, params.input1_shift);
  403. const int32_t scaled_input2_val =
  404. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  405. shifted_input2_val, params.input2_multiplier, params.input2_shift);
  406. const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
  407. const int32_t raw_output =
  408. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  409. raw_sub, params.output_multiplier, params.output_shift) +
  410. params.output_offset;
  411. const int32_t clamped_output =
  412. std::min(params.quantized_activation_max,
  413. std::max(params.quantized_activation_min, raw_output));
  414. output_data[i] = static_cast<int8_t>(clamped_output);
  415. }
  416. }
  417. inline void Sub(const ArithmeticParams& params,
  418. const RuntimeShape& input1_shape, const uint8_t* input1_data,
  419. const RuntimeShape& input2_shape, const uint8_t* input2_data,
  420. const RuntimeShape& output_shape, uint8_t* output_data) {
  421. TFLITE_DCHECK_LE(params.quantized_activation_min,
  422. params.quantized_activation_max);
  423. const int flat_size =
  424. MatchingElementsSize(input1_shape, input2_shape, output_shape);
  425. TFLITE_DCHECK_GT(params.input1_offset, -256);
  426. TFLITE_DCHECK_GT(params.input2_offset, -256);
  427. TFLITE_DCHECK_LT(params.input1_offset, 256);
  428. TFLITE_DCHECK_LT(params.input2_offset, 256);
  429. SubElementwise(flat_size, params, input1_data, input2_data, output_data);
  430. }
  431. inline void Sub(const ArithmeticParams& params,
  432. const RuntimeShape& input1_shape, const int8_t* input1_data,
  433. const RuntimeShape& input2_shape, const int8_t* input2_data,
  434. const RuntimeShape& output_shape, int8_t* output_data) {
  435. TFLITE_DCHECK_LE(params.quantized_activation_min,
  436. params.quantized_activation_max);
  437. const int flat_size =
  438. MatchingElementsSize(input1_shape, input2_shape, output_shape);
  439. const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
  440. TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
  441. TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
  442. TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
  443. TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
  444. SubElementwise(flat_size, params, input1_data, input2_data, output_data);
  445. }
  446. template <typename T>
  447. void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
  448. const T* input1_data, const RuntimeShape& input2_shape,
  449. const T* input2_data, const RuntimeShape& output_shape,
  450. T* output_data) {
  451. NdArrayDesc<4> desc1;
  452. NdArrayDesc<4> desc2;
  453. NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
  454. &desc2);
  455. const RuntimeShape extended_output_shape =
  456. RuntimeShape::ExtendedShape(4, output_shape);
  457. // In Tensorflow, the dimensions are canonically named (batch_number, row,
  458. // col, channel), with extents (batches, height, width, depth), with the
  459. // trailing dimension changing most rapidly (channels has the smallest stride,
  460. // typically 1 element).
  461. //
  462. // In generated C code, we store arrays with the dimensions reversed. The
  463. // first dimension has smallest stride.
  464. //
  465. // We name our variables by their Tensorflow convention, but generate C code
  466. // nesting loops such that the innermost loop has the smallest stride for the
  467. // best cache behavior.
  468. for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
  469. for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
  470. for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
  471. for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
  472. output_data[Offset(extended_output_shape, b, y, x, c)] =
  473. input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
  474. input2_data[SubscriptToIndex(desc2, b, y, x, c)];
  475. }
  476. }
  477. }
  478. }
  479. }
  480. inline void SetActivationMinMax(const ArithmeticParams& params,
  481. int32_t* activation_min,
  482. int32_t* activation_max) {
  483. *activation_min = params.quantized_activation_min;
  484. *activation_max = params.quantized_activation_max;
  485. }
  486. inline void SetActivationMinMax(const ArithmeticParams& params,
  487. float* activation_min, float* activation_max) {
  488. *activation_min = params.float_activation_min;
  489. *activation_max = params.float_activation_max;
  490. }
  491. inline void SetActivationMinMax(const ArithmeticParams& params,
  492. int64_t* activation_min,
  493. int64_t* activation_max) {
  494. *activation_min = params.int64_activation_min;
  495. *activation_max = params.int64_activation_max;
  496. }
  497. template <typename T>
  498. inline void SubWithActivation(
  499. const ArithmeticParams& params, const RuntimeShape& input1_shape,
  500. const T* input1_data, const RuntimeShape& input2_shape,
  501. const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
  502. ruy::profiler::ScopeLabel label("SubWithActivation");
  503. const int flat_size =
  504. MatchingElementsSize(input1_shape, input2_shape, output_shape);
  505. T activation_min, activation_max;
  506. SetActivationMinMax(params, &activation_min, &activation_max);
  507. for (int i = 0; i < flat_size; ++i) {
  508. output_data[i] = ActivationFunctionWithMinMax(
  509. input1_data[i] - input2_data[i], activation_min, activation_max);
  510. }
  511. }
  512. } // namespace reference_ops
  513. } // namespace tflite
  514. #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_