Line data Source code
1 : /****************************************************************************** 2 : * Project: GDAL Core 3 : * Purpose: Utility functions for matrix multiplication 4 : * Author: Even Rouault, <even dot rouault at spatialys.com> 5 : * 6 : ****************************************************************************** 7 : * Copyright (c) 2025, Even Rouault <even dot rouault at spatialys.com> 8 : * 9 : * SPDX-License-Identifier: MIT 10 : ****************************************************************************/ 11 : 12 : #ifndef GDAL_MATRIX_HPP 13 : #define GDAL_MATRIX_HPP 14 : 15 : #include "cpl_port.h" 16 : #include <algorithm> 17 : 18 : /************************************************************************/ 19 : /* GDALMatrixMultiplyAByTransposeAUpperTriangle() */ 20 : /************************************************************************/ 21 : 22 : // Compute res = A * A.transpose(), by filling only the upper triangle. 23 : // Do that in a cache friendly way. 24 : // We accumulate values into the output array, so generally the caller must 25 : // have make sure to zero-initialize it before (unless they want to add into it) 26 : template <class T> 27 : // CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW because it seems the uses of openmp-simd 28 : // causes that to happen 29 : static CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW void 30 24 : GDALMatrixMultiplyAByTransposeAUpperTriangle([[maybe_unused]] int nNumThreads, 31 : const T *A, // rows * cols 32 : T *res, // rows * rows 33 : int rows, size_t cols) 34 : { 35 24 : constexpr int BLOCK_SIZE = 64; 36 24 : constexpr size_t BLOCK_SIZE_COLS = 256; 37 24 : constexpr T ZERO{0}; 38 : 39 : #ifdef HAVE_OPENMP 40 24 : #pragma omp parallel for schedule(static) num_threads(nNumThreads) 41 : #endif 42 : for (int64_t ii64 = 0; ii64 < rows; ii64 += BLOCK_SIZE) 43 : { 44 : const int ii = static_cast<int>(ii64); 45 : const int i_end = ii + std::min(BLOCK_SIZE, rows - ii); 46 : for (int jj = ii; jj < rows; 47 : jj += std::min(BLOCK_SIZE, rows - jj)) // upper triangle 48 : { 49 : const int j_end = jj + std::min(BLOCK_SIZE, rows - jj); 50 : for (size_t cc = 0; cc < cols; /* increment done at end of loop */) 51 : { 52 : const size_t c_end = cc + std::min(BLOCK_SIZE_COLS, cols - cc); 53 : 54 : for (int i = ii; i < i_end; ++i) 55 : { 56 : const T *const Ai = &A[i * cols]; 57 : int j = std::max(i, jj); 58 : for (; j < j_end - 7; j += 8) 59 : { 60 : const T *const Ajp0 = A + (j + 0) * cols; 61 : const T *const Ajp1 = A + (j + 1) * cols; 62 : const T *const Ajp2 = A + (j + 2) * cols; 63 : const T *const Ajp3 = A + (j + 3) * cols; 64 : const T *const Ajp4 = A + (j + 4) * cols; 65 : const T *const Ajp5 = A + (j + 5) * cols; 66 : const T *const Ajp6 = A + (j + 6) * cols; 67 : const T *const Ajp7 = A + (j + 7) * cols; 68 : 69 : T dfSum0 = ZERO, dfSum1 = ZERO, dfSum2 = ZERO, 70 : dfSum3 = ZERO, dfSum4 = ZERO, dfSum5 = ZERO, 71 : dfSum6 = ZERO, dfSum7 = ZERO; 72 : #ifdef HAVE_OPENMP_SIMD 73 : #pragma omp simd reduction(+ : dfSum0, dfSum1, dfSum2, dfSum3, dfSum4, dfSum5, dfSum6, dfSum7) 74 : #endif 75 : for (size_t c = cc; c < c_end; ++c) 76 : { 77 : dfSum0 += Ai[c] * Ajp0[c]; 78 : dfSum1 += Ai[c] * Ajp1[c]; 79 : dfSum2 += Ai[c] * Ajp2[c]; 80 : dfSum3 += Ai[c] * Ajp3[c]; 81 : dfSum4 += Ai[c] * Ajp4[c]; 82 : dfSum5 += Ai[c] * Ajp5[c]; 83 : dfSum6 += Ai[c] * Ajp6[c]; 84 : dfSum7 += Ai[c] * Ajp7[c]; 85 : } 86 : 87 : const auto nResOffset = 88 : static_cast<size_t>(i) * rows + j; 89 : res[nResOffset + 0] += dfSum0; 90 : res[nResOffset + 1] += dfSum1; 91 : res[nResOffset + 2] += dfSum2; 92 : res[nResOffset + 3] += dfSum3; 93 : res[nResOffset + 4] += dfSum4; 94 : res[nResOffset + 5] += dfSum5; 95 : res[nResOffset + 6] += dfSum6; 96 : res[nResOffset + 7] += dfSum7; 97 : } 98 : for (; j < j_end; ++j) 99 : { 100 : const T *const Aj = A + j * cols; 101 : 102 : T dfSum = ZERO; 103 : #ifdef HAVE_OPENMP_SIMD 104 : #pragma omp simd reduction(+ : dfSum) 105 : #endif 106 : for (size_t c = cc; c < c_end; ++c) 107 : { 108 : dfSum += Ai[c] * Aj[c]; 109 : } 110 : 111 : res[static_cast<size_t>(i) * rows + j] += dfSum; 112 : } 113 : } 114 : 115 : cc = c_end; 116 : } 117 : } 118 : } 119 24 : } 120 : 121 : #endif