@@ -0,0 +1,774 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <utility>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
+
+#if defined(_MSC_VER)
+#define __restrict__ __restrict
+#endif
+
+namespace tflite {
+namespace tensor_utils {
+
+namespace {
+const int32_t kInt16Max = std::numeric_limits<int16_t>::max();
+const int32_t kInt16Min = std::numeric_limits<int16_t>::min();
+}  // namespace
+
+void PortableSymmetricQuantizeFloats(const float* values, const int size,
+                                     int8_t* quantized_values, float* min_value,
+                                     float* max_value, float* scaling_factor) {
+  auto minmax = std::minmax_element(values, values + size);
+  *min_value = *minmax.first;
+  *max_value = *minmax.second;
+
+  PortableSymmetricQuantizeFloats(values, size, quantized_values, *min_value,
+                                  *max_value, scaling_factor);
+}
+
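+// Symmetric quantization maps floats to int8 with a zero point of 0:
+// scaling_factor = max(|min_value|, |max_value|) / 127, and each value is
+// quantized as round(value / scaling_factor), clamped to [-127, 127].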
+void PortableSymmetricQuantizeFloats(const float* values, const int size,
+                                     int8_t* quantized_values, float min_value,
+                                     float max_value, float* scaling_factor) {
+  const int32_t kScale = 127;
+  const float range = std::max(std::abs(min_value), std::abs(max_value));
+  if (range == 0) {
+    memset(quantized_values, 0, size * sizeof(int8_t));
+    *scaling_factor = 1;
+    return;
+  }
+  *scaling_factor = range / kScale;
+  const float scaling_factor_inv = kScale / range;
+  for (int i = 0; i < size; ++i) {
+    const int32_t quantized_value =
+        static_cast<int32_t>(TfLiteRound(values[i] * scaling_factor_inv));
+    // Clamp, in case rounding lands just outside [-127, 127].
+    quantized_values[i] = static_cast<int8_t>(
+        std::min(kScale, std::max(-kScale, quantized_value)));
+  }
+}
+
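+// Asymmetric quantization: the value range (widened to include 0) is mapped
+// onto [-128, 127]. The zero point is taken from whichever end of the range
+// produces the smaller error term, nudged into [-128, 127], and each value is
+// quantized as round(offset + value / scaling_factor), clamped to int8.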
+void PortableAsymmetricQuantizeFloats(const float* values, const int size,
+                                      int8_t* quantized_values,
+                                      float* scaling_factor, int32_t* offset) {
+  const int32_t kMinScale = -128;
+  const int32_t kMaxScale = 127;
+  const double qmin_double = kMinScale;
+  const double qmax_double = kMaxScale;
+  const auto minmax = std::minmax_element(values, values + size);
+  const double rmin = std::fmin(0, *minmax.first);
+  const double rmax = std::fmax(0, *minmax.second);
+  if (rmin == rmax) {
+    memset(quantized_values, 0, size * sizeof(int8_t));
+    *scaling_factor = 1;
+    *offset = 0;
+    return;
+  } else {
+    double scale = (rmax - rmin) / (qmax_double - qmin_double);
+    const double zero_point_from_min = qmin_double - rmin / scale;
+    const double zero_point_from_max = qmax_double - rmax / scale;
+    const double zero_point_from_min_error =
+        std::abs(qmin_double) + std::abs(rmin / scale);
+    const double zero_point_from_max_error =
+        std::abs(qmax_double) + std::abs(rmax / scale);
+    const double zero_point_double =
+        zero_point_from_min_error < zero_point_from_max_error
+            ? zero_point_from_min
+            : zero_point_from_max;
+    int8_t nudged_zero_point = 0;
+    if (zero_point_double <= qmin_double) {
+      nudged_zero_point = kMinScale;
+    } else if (zero_point_double >= qmax_double) {
+      nudged_zero_point = kMaxScale;
+    } else {
+      nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
+    }
+    *scaling_factor = scale;
+    *offset = nudged_zero_point;
+  }
+  const float scaling_factor_inv = 1.0f / *scaling_factor;
+  for (int i = 0; i < size; ++i) {
+    const int32_t quantized_value = static_cast<int32_t>(
+        TfLiteRound(*offset + values[i] * scaling_factor_inv));
+    quantized_values[i] =
+        std::min(kMaxScale, std::max(kMinScale, quantized_value));
+  }
+}
+
+void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
+                                                 int m_rows, int m_cols,
+                                                 const float* vector,
+                                                 int n_batch, float* result) {
+  float* result_in_batch = result;
+  for (int b = 0; b < n_batch; b++) {
+    const float* matrix_ptr = matrix;
+    for (int r = 0; r < m_rows; r++) {
+      float dot_prod = 0.0f;
+      const float* vector_in_batch = vector + b * m_cols;
+      for (int c = 0; c < m_cols; c++) {
+        dot_prod += *matrix_ptr++ * *vector_in_batch++;
+      }
+      *result_in_batch += dot_prod;
+      ++result_in_batch;
+    }
+  }
+}
+
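+// Hybrid matrix * batch-vector product: both the matrix and the vectors are
+// int8, the dot product is accumulated in int32, and the result is rescaled
+// to float with the per-batch scaling factor before being added into result.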
+void PortableMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* __restrict__ vectors, const float* scaling_factors,
+    int n_batch, float* __restrict__ result) {
+  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
+    const float batch_scaling_factor = scaling_factors[batch];
+    // Get the address of the first row.
+    const int8_t* row_ptr = matrix;
+    for (int row = 0; row < m_rows; ++row) {
+      // Initialize the dot product sum for the row to 0.
+      int32_t dotprod = 0;
+#if defined(__GNUC__)
+      // Prefetch the row to cache.
+      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
+                         3 /* temporal locality */);
+#endif
+      for (int col = 0; col < m_cols; ++col, ++row_ptr) {
+        dotprod += (*row_ptr) * (vectors[col]);
+      }  // for col
+      *result += dotprod * batch_scaling_factor;
+      ++result;
+    }  // for row
+  }  // for batch
+}
+
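+// Variant with asymmetric (zero-point) inputs and optional per-channel scales.
+// Instead of subtracting the input offset inside the inner loop, it uses the
+// identity sum((v - offset) * w) = sum(v * w) - offset * sum(w): the per-row
+// weight sums are cached in row_sums and refreshed when compute_row_sums is
+// null or points to true.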
+void PortableMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* __restrict__ vectors, const float* scaling_factors,
+    int n_batch, float* __restrict__ result, const float* per_channel_scale,
+    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
+    bool* compute_row_sums, CpuBackendContext* context) {
+  if (input_offset == nullptr) {
+    PortableMatrixBatchVectorMultiplyAccumulate(
+        matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
+    return;
+  }
+  if (!compute_row_sums || *compute_row_sums) {
+    PortableReductionSumVector(matrix, row_sums, m_rows, m_cols);
+    if (compute_row_sums) {
+      *compute_row_sums = false;
+    }
+  }
+
+  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
+    const float batch_scaling_factor = scaling_factors[batch];
+    const int32_t batch_offset = input_offset[batch];
+    const int8_t* row_ptr = matrix;
+    for (int row = 0; row < m_rows; ++row) {
+      int32_t dotprod = 0;
+      float scale = batch_scaling_factor;
+      if (per_channel_scale) {
+        scale *= per_channel_scale[row];
+      }
+#if defined(__GNUC__)
+      // Prefetch the row to cache.
+      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
+                         3 /* temporal locality */);
+#endif
+      for (int col = 0; col < m_cols; ++col, ++row_ptr) {
+        dotprod += (*row_ptr) * vectors[col];
+      }  // for col
+      dotprod -= row_sums[row] * batch_offset;
+      *result += dotprod * scale;
+      ++result;
+    }  // for row
+  }  // for batch
+}
+
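+// Block-sparse (1x4) matrix * batch-vector product in CSR-like form: entries
+// segments[row]..segments[row + 1] index into `indices`, each of which names
+// a nonzero 4-wide block column; only the nonzero blocks are stored
+// contiguously in `matrix`.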
+void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
+    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
+    const int32_t* __restrict__ indices, int m_rows, int m_cols,
+    const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
+  const int kBlockSize = 4;
+  TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
+  for (int batch = 0; batch < n_batch; batch++) {
+    const float* matrix_ptr = matrix;
+    for (int row = 0; row < m_rows; row++) {
+      float dot_prod = 0.0f;
+      const float* vector_in_batch = vector + batch * m_cols;
+      for (int i = segments[row]; i < segments[row + 1]; i++) {
+        const int block_start_index = indices[i] * kBlockSize;
+        const float* vector_block_in_batch_ptr =
+            vector_in_batch + block_start_index;
+        for (int c = 0; c < kBlockSize; c++) {
+          dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
+        }
+      }
+      result[batch * m_rows + row] += dot_prod;
+    }
+  }
+}
+
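+// Ledger-encoded sparse matrix * batch-vector product: for each row the
+// ledger stores the number of nonzero 16-wide blocks followed by the block
+// indices, and `matrix` stores only the nonzero blocks back to back.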
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
+    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
+    float* __restrict__ result) {
+  const int kBlockSize = 16;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  for (int batch = 0; batch < n_batch; batch++) {
+    const float* matrix_ptr = matrix;
+    const uint8_t* ledger_ptr = ledger;
+    for (int row = 0; row < m_rows; row++) {
+      float dot_prod = 0.0f;
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        const float* vector_in_batch = vector + batch * m_cols;
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int block_start_index = *ledger_ptr++ * kBlockSize;
+          const float* vector_block_in_batch_ptr =
+              vector_in_batch + block_start_index;
+          for (int c = 0; c < kBlockSize; c++) {
+            dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
+          }
+        }
+      }
+      result[batch * m_rows + row] += dot_prod;
+    }
+  }
+}
+
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result) {
+  static const int kBlockSize = 16;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
+    const float batch_scaling_factor = scaling_factors[batch];
+    const uint8_t* ledger_ptr = ledger;
+    // Get the address of the first row.
+    const int8_t* row_ptr = matrix;
+    for (int row = 0; row < m_rows; ++row) {
+      // Initialize the dot product sum for the row to 0.
+      int32_t dotprod = 0;
+#if defined(__GNUC__)
+      // Prefetch the row to cache.
+      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
+                         3 /* temporal locality */);
+#endif
+      int num_nonzero_blocks = *ledger_ptr++;
+      for (int i = 0; i < num_nonzero_blocks; i++) {
+        const int block_start_index = *ledger_ptr++ * kBlockSize;
+        const int8_t* vector_block_ptr = vectors + block_start_index;
+        for (int c = 0; c < kBlockSize; c++) {
+          dotprod += (*row_ptr++) * (*vector_block_ptr++);
+        }  // for block
+      }  // for num_nonzero_blocks
+      result[batch * m_rows + row] += dotprod * batch_scaling_factor;
+    }  // for row
+  }  // for batch
+}
+
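+// Fully integer matrix * batch-vector product shared by the int16 and int8
+// output variants below: int8 inputs and weights are accumulated into int32
+// together with a bias, requantized with (multiplier, shift), offset by
+// output_zp, accumulated into the existing output and saturated to the
+// output type.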
+template <typename T>
+void PortableMatrixBatchVectorMultiplyAccumulateImpl(
+    const int8_t* input, const int32_t* bias,
+    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
+    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
+    T* output) {
+  const int16_t output_max = std::numeric_limits<T>::max();
+  const int16_t output_min = std::numeric_limits<T>::min();
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int row = 0; row < n_output; ++row) {
+      int32_t acc = bias[row];
+      for (int col = 0; col < n_input; ++col) {
+        int8_t input_val = input[batch * n_input + col];
+        int8_t weights_val = input_to_gate_weights[row * n_input + col];
+        acc += input_val * weights_val;
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
+      acc += output_zp;
+      acc += output[batch * n_output + row];
+      if (acc > output_max) {
+        acc = output_max;
+      }
+      if (acc < output_min) {
+        acc = output_min;
+      }
+      output[batch * n_output + row] = static_cast<T>(acc);
+    }
+  }
+}
+
+void PortableMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* input, const int32_t* bias,
+    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
+    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
+    int32_t* scratch, int16_t* output, CpuBackendContext* context) {
+  PortableMatrixBatchVectorMultiplyAccumulateImpl(
+      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
+      n_output, output_zp, output);
+}
+
+void PortableMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* input, const int32_t* bias,
+    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
+    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
+    int32_t* scratch, int8_t* output, CpuBackendContext* context) {
+  PortableMatrixBatchVectorMultiplyAccumulateImpl(
+      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
+      n_output, output_zp, output);
+}
+
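+// Computes an int8 gate output from an int8 input with a zero point: the
+// input offset is removed inside the accumulation, and the int32 accumulator
+// is requantized with the effective scale (a, b), offset by gate_output_zp
+// and saturated to int8.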
+void PortableMatrixBatchVectorMultiply(const int8_t* input,
+                                       int32_t input_zeropoint,
+                                       const int8_t* input_to_gate_weights,
+                                       int32_t input_to_gate_effective_scale_a,
+                                       int32_t input_to_gate_effective_scale_b,
+                                       int32_t n_batch, int32_t n_input,
+                                       int32_t n_cell, int8_t* gate_output,
+                                       int8_t gate_output_zp) {
+  const int32_t int8_max = std::numeric_limits<int8_t>::max();
+  const int32_t int8_min = std::numeric_limits<int8_t>::min();
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int row = 0; row < n_cell; ++row) {
+      int32_t acc = 0;
+      for (int col = 0; col < n_input; ++col) {
+        int32_t input_val = input[batch * n_input + col];
+        int8_t weights_val = input_to_gate_weights[row * n_input + col];
+        acc += (input_val - input_zeropoint) * weights_val;
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, input_to_gate_effective_scale_a,
+                                          input_to_gate_effective_scale_b);
+      acc += gate_output_zp;
+      if (acc > int8_max) {
+        acc = int8_max;
+      }
+      if (acc < int8_min) {
+        acc = int8_min;
+      }
+      gate_output[batch * n_cell + row] = static_cast<int8_t>(acc);
+    }
+  }
+}
+
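+// Projection matmul with an int16 activation and int8 weights. The
+// accumulation is done in int64 with overflow guards that clamp to the int32
+// limits; the result is then requantized with the effective projection scale,
+// offset by output_zp and saturated to int8.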
+void PortableMatrixBatchVectorMultiply(
+    const int16_t* hidden, const int8_t* hidden_to_output_weights,
+    int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
+    const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
+    int32_t n_output, int32_t output_zp, int8_t* proj_output) {
+  const int16_t int8_max = std::numeric_limits<int8_t>::max();
+  const int16_t int8_min = std::numeric_limits<int8_t>::min();
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int row = 0; row < n_output; ++row) {
+      int64_t acc = gate_bias[row];
+      for (int col = 0; col < n_hidden; ++col) {
+        int16_t input_val = hidden[batch * n_hidden + col];
+        int8_t weights_val = hidden_to_output_weights[row * n_hidden + col];
+        int64_t curr = acc;
+        acc += input_val * weights_val;
+        if (input_val * weights_val > 0 && acc < curr) {
+          acc = std::numeric_limits<int32_t>::max();
+        }
+        if (input_val * weights_val < 0 && acc > curr) {
+          acc = std::numeric_limits<int32_t>::min();
+        }
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, proj_effective_scale_a,
+                                          proj_effective_scale_b);
+      acc += output_zp;
+      if (acc > int8_max) {
+        acc = int8_max;
+      }
+      if (acc < int8_min) {
+        acc = int8_min;
+      }
+      proj_output[batch * n_output + row] = acc;
+    }
+  }
+}
+
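+// Integer layer normalization. Sums and squared sums are accumulated in
+// int64; the mean carries an extra 2^10 factor (and the variance 2^20) so the
+// normalized values keep enough resolution, the inverse standard deviation is
+// obtained as a quantized multiplier via GetInvSqrtQuantizedMultiplierExp,
+// and the weighted, biased result is requantized to int16 with
+// layer_norm_scale_(a, b).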
+void PortableApplyLayerNorm(const int16_t* input,
+                            const int16_t* layer_norm_weights,
+                            const int32_t* bias, int32_t layer_norm_scale_a,
+                            int32_t layer_norm_scale_b, int32_t variance_limit,
+                            int n_batch, int n_input, int16_t* output) {
+  // The square of std::pow(2, 10), which is the extra factor that makes sure
+  // normalized values have enough resolution.
+  static const int kTwoToPower20 = 1 << 20;
+  for (int i = 0; i < n_batch; ++i) {
+    int64_t sum = 0;
+    int64_t sum_sq = 0;
+    for (int j = 0; j < n_input; ++j) {
+      const int32_t index = i * n_input + j;
+      int32_t val = static_cast<int32_t>(input[index]);
+      sum += val;
+      sum_sq += val * val;
+    }
+    int32_t mean =
+        static_cast<int32_t>(static_cast<int64_t>(sum) * 1024 / n_input);
+    // TODO(b/173994730): Avoids overflow but only works for POT n_input.
+    int32_t temp = kTwoToPower20 / n_input;
+    int64_t variance =
+        sum_sq * temp - static_cast<int64_t>(mean) * static_cast<int64_t>(mean);
+    int32_t variance2 = static_cast<int32_t>(variance / kTwoToPower20);
+    if (variance2 < 1) {
+      variance2 = variance_limit;
+    }
+    int32_t stddev_inverse_a;
+    int stddev_inverse_b;
+    GetInvSqrtQuantizedMultiplierExp(variance2, /*reverse_shift*/ -1,
+                                     &stddev_inverse_a, &stddev_inverse_b);
+
+    for (int j = 0; j < n_input; ++j) {
+      const int32_t index = i * n_input + j;
+      int32_t val = static_cast<int32_t>(input[index]);
+      int32_t shifted = 1024 * val - mean;
+      int32_t rescaled = MultiplyByQuantizedMultiplier(
+          shifted, stddev_inverse_a, stddev_inverse_b);
+      // TODO(jianlijianli): Saturate this.
+      int64_t val3 = rescaled * layer_norm_weights[j] + bias[j];
+      int32_t val4 =
+          static_cast<int32_t>((val3 > 0 ? val3 + 512 : val3 - 512) / 1024);
+      int32_t val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a,
+                                                   layer_norm_scale_b + 12);
+      val5 = std::min(std::max(kInt16Min, val5), kInt16Max);
+      output[index] = static_cast<int16_t>(val5);
+    }
+  }
+}
+
+void PortableApplyLayerNormFloat(const int16_t* input,
+                                 const int16_t* layer_norm_weights,
+                                 int32_t layer_norm_scale_a,
+                                 int32_t layer_norm_scale_b,
+                                 const int32_t* bias, int n_batch, int n_input,
+                                 int16_t* output) {
+  const int32_t int16_max = std::numeric_limits<int16_t>::max();
+  const int32_t int16_min = std::numeric_limits<int16_t>::min();
+  const float layer_norm_scale =
+      layer_norm_scale_a *
+      std::pow(2.0, static_cast<double>(layer_norm_scale_b - 31));
+  const float bias_scale =
+      static_cast<float>(std::pow(2.0, -10)) * layer_norm_scale;
+
+  for (int batch = 0; batch < n_batch; ++batch) {
+    float sum = 0.0f;
+    float sum_sq = 0.0f;
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const float value = static_cast<float>(input[index]);
+      sum += value;
+      sum_sq += value * value;
+    }
+    const float mean = sum / n_input;
+    float stddev_inv = 0.0f;
+    const float variance = sum_sq / n_input - mean * mean;
+    if (variance == 0) {
+      stddev_inv = 1.0f / std::sqrt(1e-8f);
+    } else {
+      stddev_inv = 1.0f / std::sqrt(variance);
+    }
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const float normalized_value =
+          (static_cast<float>(input[index]) - mean) * stddev_inv;
+      const float weighted_normalized_value =
+          normalized_value * layer_norm_weights[i] * layer_norm_scale +
+          bias[i] * bias_scale;
+      const int32_t quant_output = static_cast<int32_t>(std::round(
+          weighted_normalized_value * static_cast<float>(std::pow(2, 12))));
+      output[index] = std::min(int16_max, std::max(int16_min, quant_output));
+    }
+  }
+}
+
+void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
+                                            int32_t scalar, int32_t n_row,
+                                            int32_t n_col, int32_t* output) {
+  for (int i = 0; i < n_row; ++i) {
+    int32_t row_sum = 0;
+    for (int j = 0; j < n_col; ++j) {
+      row_sum += *matrix++;
+    }
+    output[i] += row_sum * scalar;
+  }
+}
+
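+// Sigmoid on Q3.12 inputs producing Q0.15 outputs, using gemmlowp's
+// fixed-point logistic. The *Float variant below computes the same activation
+// in float (input scaled by 2^-12, output by 2^15) and serves as a reference.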
+void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
+                          int32_t n_input, int16_t* output) {
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int c = 0; c < n_input; c++) {
+      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+      const int index = batch * n_input + c;
+      F3 sigmoid_input = F3::FromRaw(input[index]);
+      F0 sigmoid_output = gemmlowp::logistic(sigmoid_input);
+      output[index] = sigmoid_output.raw();
+    }
+  }
+}
+
+void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
+                               int32_t n_input, int16_t* output) {
+  const int32_t int16_max = std::numeric_limits<int16_t>::max();
+  const int32_t int16_min = std::numeric_limits<int16_t>::min();
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const float float_input =
+          input[index] * static_cast<float>(std::pow(2, -12));
+      const float float_output = 1.0f / (1.0f + std::exp(-float_input));
+      const int32_t quant_output = static_cast<int32_t>(
+          float_output * static_cast<float>(std::pow(2, 15)));
+      const int32_t quant_output_clamped =
+          std::min(int16_max, std::max(int16_min, quant_output));
+      output[index] = static_cast<int16_t>(quant_output_clamped);
+    }
+  }
+}
+
+template <int IntegerBits>
+void PortableApplyTanhImpl(const int16_t* input, int32_t n_batch,
+                           int32_t n_input, int16_t* output) {
+  using FX = gemmlowp::FixedPoint<std::int16_t, IntegerBits>;
+  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      FX tanh_input = FX::FromRaw(input[index]);
+      F0 tanh_output = gemmlowp::tanh(tanh_input);
+      output[index] = tanh_output.raw();
+    }
+  }
+}
+
+void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
+                       int32_t n_batch, int32_t n_input, int16_t* output) {
+  assert(integer_bits <= 6);
+#define DISPATCH_TANH(i)                                       \
+  case i:                                                      \
+    PortableApplyTanhImpl<i>(input, n_batch, n_input, output); \
+    break;
+  switch (integer_bits) {
+    DISPATCH_TANH(0);
+    DISPATCH_TANH(1);
+    DISPATCH_TANH(2);
+    DISPATCH_TANH(3);
+    DISPATCH_TANH(4);
+    DISPATCH_TANH(5);
+    DISPATCH_TANH(6);
+    default:
+      return;
+  }
+#undef DISPATCH_TANH
+}
+
+void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
+                            int32_t n_input, int32_t integer_bits,
+                            int16_t* output) {
+  const int32_t int16_max = std::numeric_limits<int16_t>::max();
+  const int32_t int16_min = std::numeric_limits<int16_t>::min();
+  const double two = 2.0;
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const float float_input =
+          input[index] * std::pow(two, static_cast<double>(integer_bits));
+      const float float_output = std::tanh(float_input);
+      const int32_t quant_output = static_cast<int32_t>(
+          float_output * static_cast<float>(std::pow(2, 15)));
+      const int32_t quant_output_clamped =
+          std::min(int16_max, std::max(int16_min, quant_output));
+      output[index] = static_cast<int16_t>(quant_output_clamped);
+    }
+  }
+}
+
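+// Element-wise multiply of two int16 tensors. The first overload keeps an
+// int16 result using a rounding right shift of the int32 product; the second
+// requantizes the product with (multiplier, shift), subtracts output_zp and
+// saturates to int8.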
+void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
+                      int n_batch, int n_input, int shift, int16_t* output) {
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const int16_t a = input_1[index];
+      const int16_t b = input_2[index];
+      const int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
+      output[index] =
+          static_cast<int16_t>(gemmlowp::RoundingDivideByPOT(value, shift));
+    }
+  }
+}
+
+void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
+                      int32_t multiplier, int32_t shift, int32_t n_batch,
+                      int32_t n_input, int32_t output_zp, int8_t* output) {
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const int16_t a = input_1[index];
+      const int16_t b = input_2[index];
+      int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
+      value = MultiplyByQuantizedMultiplier(value, multiplier, shift);
+      value -= output_zp;
+      value = std::min(std::max(static_cast<int32_t>(-128), value),
+                       static_cast<int32_t>(127));
+
+      output[index] = static_cast<int8_t>(value);
+    }
+  }
+}
+
+void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
+                      int n_batch, int n_input, int16_t* output) {
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      int32_t sum = input_1[index] + input_2[index];
+      const int32_t sum_clamped = std::min(kInt16Max, std::max(kInt16Min, sum));
+      output[index] = static_cast<int16_t>(sum_clamped);
+    }
+  }
+}
+
+float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
+                                     int v_size) {
+  float result = 0.0;
+  for (int v = 0; v < v_size; v++) {
+    result += *vector1++ * *vector2++;
+  }
+  return result;
+}
+
+namespace {
+inline int32_t VectorVectorDotProduct(const int16_t* vector1,
+                                      const int16_t* vector2, int v_size) {
+  int32_t result = 0;
+  for (int v = 0; v < v_size; v++) {
+    result += *vector1++ * *vector2++;
+  }
+  return result;
+}
+}  // namespace
+
+void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
+                                              const int16_t* vector2,
+                                              int v_size, int n_batch,
+                                              int32_t* result) {
+  for (int b = 0; b < n_batch; b++) {
+    result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
+    vector1 += v_size;
+    vector2 += v_size;
+  }
+}
+
+void PortableVectorBatchVectorCwiseProductAccumulate(
+    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
+    int32_t multiplier, int shift, int16_t* result) {
+  for (int b = 0; b < n_batch; b++) {
+    for (int v = 0; v < v_size; v++) {
+      int32_t prod = vector[v] * *batch_vector++;
+      prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift);
+      int32_t output = prod + *result;
+      output = std::max(std::min(static_cast<int32_t>(32767), output),
+                        static_cast<int32_t>(-32768));
+      *result++ = output;
+    }
+  }
+}
+
+void PortableSub1Vector(const float* vector, int v_size, float* result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ = 1.0f - *vector++;
+  }
+}
+
+void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result) {
+  static const int16_t kOne = 32767;
+  for (int v = 0; v < v_size; v++) {
+    *result++ = kOne - *vector++;
+  }
+}
+
+void PortableVectorScalarMultiply(const int8_t* vector, const int v_size,
+                                  const float scale, float* result) {
+  for (int v = 0; v < v_size; ++v) {
+    *result++ = scale * *vector++;
+  }
+}
+
+void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
+                                     float* __restrict__ output_vector,
+                                     int v_size, int n_batch) {
+  for (int batch = 0; batch < n_batch; ++batch) {
+    float sum = 0.0f;
+    for (int i = 0; i < v_size; ++i) {
+      sum += input_vector[i];
+    }
+    const float mean = sum / v_size;
+    float sum_diff_sq = 0.0f;
+    for (int i = 0; i < v_size; ++i) {
+      const float diff = input_vector[i] - mean;
+      sum_diff_sq += diff * diff;
+    }
+    const float variance = sum_diff_sq / v_size;
+    constexpr float kNormalizationConstant = 1e-8f;
+    const float stddev_inv =
+        1.0f / std::sqrt(variance + kNormalizationConstant);
+    for (int i = 0; i < v_size; ++i) {
+      output_vector[i] = (input_vector[i] - mean) * stddev_inv;
+    }
+    input_vector += v_size;
+    output_vector += v_size;
+  }
+}
+
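+// Adds the input and recurrent contributions of a gate: both int8 operands
+// have their zero points removed, are rescaled with their effective (a, b)
+// multipliers, summed, and saturated to int16.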
+void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
+                                  const int8_t* recurrent, int8_t recurrent_zp,
+                                  int32_t input_effective_scale_a,
+                                  int32_t input_effective_scale_b,
+                                  int32_t recurrent_effective_scale_a,
+                                  int32_t recurrent_effective_scale_b,
+                                  int32_t n_batch, int32_t n_cell,
+                                  int16_t* output) {
+  const int32_t int16_max = std::numeric_limits<int16_t>::max();
+  const int32_t int16_min = std::numeric_limits<int16_t>::min();
+  for (int i = 0; i < n_batch * n_cell; ++i) {
+    int32_t x = static_cast<int32_t>(input[i]) - static_cast<int32_t>(input_zp);
+    int32_t h =
+        static_cast<int32_t>(recurrent[i]) - static_cast<int32_t>(recurrent_zp);
+    int32_t x_scaled = MultiplyByQuantizedMultiplier(x, input_effective_scale_a,
+                                                     input_effective_scale_b);
+    int32_t h_scaled = MultiplyByQuantizedMultiplier(
+        h, recurrent_effective_scale_a, recurrent_effective_scale_b);
+    int32_t y = h_scaled + x_scaled;
+    if (y > int16_max) {
+      y = int16_max;
+    }
+    if (y < int16_min) {
+      y = int16_min;
+    }
+    output[i] = static_cast<int16_t>(y);
+  }
+}
+
+}  // namespace tensor_utils
+}  // namespace tflite