LCOV - code coverage report
Current view: top level - gcore - gdal_matrix.hpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 6 6 100.0 %
Date: 2025-12-21 22:14:19 Functions: 1 2 50.0 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  * Project:  GDAL Core
       3             :  * Purpose:  Utility functions for matrix multiplication
       4             :  * Author:   Even Rouault, <even dot rouault at spatialys.com>
       5             :  *
       6             :  ******************************************************************************
       7             :  * Copyright (c) 2025, Even Rouault <even dot rouault at spatialys.com>
       8             :  *
       9             :  * SPDX-License-Identifier: MIT
      10             :  ****************************************************************************/
      11             : 
      12             : #ifndef GDAL_MATRIX_HPP
      13             : #define GDAL_MATRIX_HPP
      14             : 
      15             : #include "cpl_port.h"
      16             : #include <algorithm>
      17             : 
      18             : /************************************************************************/
      19             : /*               GDALMatrixMultiplyAByTransposeAUpperTriangle()         */
      20             : /************************************************************************/
      21             : 
      22             : // Compute res = A * A.transpose(), by filling only the upper triangle.
      23             : // Do that in a cache friendly way.
      24             : // We accumulate values into the output array, so generally the caller must
      25             : // have make sure to zero-initialize it before (unless they want to add into it)
      26             : template <class T>
      27             : // CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW because it seems the uses of openmp-simd
      28             : // causes that to happen
      29             : static CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW void
      30          24 : GDALMatrixMultiplyAByTransposeAUpperTriangle([[maybe_unused]] int nNumThreads,
      31             :                                              const T *A,  // rows * cols
      32             :                                              T *res,      // rows * rows
      33             :                                              int rows, size_t cols)
      34             : {
      35          24 :     constexpr int BLOCK_SIZE = 64;
      36          24 :     constexpr size_t BLOCK_SIZE_COLS = 256;
      37          24 :     constexpr T ZERO{0};
      38             : 
      39             : #ifdef HAVE_OPENMP
      40          24 : #pragma omp parallel for schedule(static) num_threads(nNumThreads)
      41             : #endif
      42             :     for (int64_t ii64 = 0; ii64 < rows; ii64 += BLOCK_SIZE)
      43             :     {
      44             :         const int ii = static_cast<int>(ii64);
      45             :         const int i_end = ii + std::min(BLOCK_SIZE, rows - ii);
      46             :         for (int jj = ii; jj < rows;
      47             :              jj += std::min(BLOCK_SIZE, rows - jj))  // upper triangle
      48             :         {
      49             :             const int j_end = jj + std::min(BLOCK_SIZE, rows - jj);
      50             :             for (size_t cc = 0; cc < cols; /* increment done at end of loop */)
      51             :             {
      52             :                 const size_t c_end = cc + std::min(BLOCK_SIZE_COLS, cols - cc);
      53             : 
      54             :                 for (int i = ii; i < i_end; ++i)
      55             :                 {
      56             :                     const T *const Ai = &A[i * cols];
      57             :                     int j = std::max(i, jj);
      58             :                     for (; j < j_end - 7; j += 8)
      59             :                     {
      60             :                         const T *const Ajp0 = A + (j + 0) * cols;
      61             :                         const T *const Ajp1 = A + (j + 1) * cols;
      62             :                         const T *const Ajp2 = A + (j + 2) * cols;
      63             :                         const T *const Ajp3 = A + (j + 3) * cols;
      64             :                         const T *const Ajp4 = A + (j + 4) * cols;
      65             :                         const T *const Ajp5 = A + (j + 5) * cols;
      66             :                         const T *const Ajp6 = A + (j + 6) * cols;
      67             :                         const T *const Ajp7 = A + (j + 7) * cols;
      68             : 
      69             :                         T dfSum0 = ZERO, dfSum1 = ZERO, dfSum2 = ZERO,
      70             :                           dfSum3 = ZERO, dfSum4 = ZERO, dfSum5 = ZERO,
      71             :                           dfSum6 = ZERO, dfSum7 = ZERO;
      72             : #ifdef HAVE_OPENMP_SIMD
      73             : #pragma omp simd reduction(+ : dfSum0, dfSum1, dfSum2, dfSum3, dfSum4, dfSum5, dfSum6, dfSum7)
      74             : #endif
      75             :                         for (size_t c = cc; c < c_end; ++c)
      76             :                         {
      77             :                             dfSum0 += Ai[c] * Ajp0[c];
      78             :                             dfSum1 += Ai[c] * Ajp1[c];
      79             :                             dfSum2 += Ai[c] * Ajp2[c];
      80             :                             dfSum3 += Ai[c] * Ajp3[c];
      81             :                             dfSum4 += Ai[c] * Ajp4[c];
      82             :                             dfSum5 += Ai[c] * Ajp5[c];
      83             :                             dfSum6 += Ai[c] * Ajp6[c];
      84             :                             dfSum7 += Ai[c] * Ajp7[c];
      85             :                         }
      86             : 
      87             :                         const auto nResOffset =
      88             :                             static_cast<size_t>(i) * rows + j;
      89             :                         res[nResOffset + 0] += dfSum0;
      90             :                         res[nResOffset + 1] += dfSum1;
      91             :                         res[nResOffset + 2] += dfSum2;
      92             :                         res[nResOffset + 3] += dfSum3;
      93             :                         res[nResOffset + 4] += dfSum4;
      94             :                         res[nResOffset + 5] += dfSum5;
      95             :                         res[nResOffset + 6] += dfSum6;
      96             :                         res[nResOffset + 7] += dfSum7;
      97             :                     }
      98             :                     for (; j < j_end; ++j)
      99             :                     {
     100             :                         const T *const Aj = A + j * cols;
     101             : 
     102             :                         T dfSum = ZERO;
     103             : #ifdef HAVE_OPENMP_SIMD
     104             : #pragma omp simd reduction(+ : dfSum)
     105             : #endif
     106             :                         for (size_t c = cc; c < c_end; ++c)
     107             :                         {
     108             :                             dfSum += Ai[c] * Aj[c];
     109             :                         }
     110             : 
     111             :                         res[static_cast<size_t>(i) * rows + j] += dfSum;
     112             :                     }
     113             :                 }
     114             : 
     115             :                 cc = c_end;
     116             :             }
     117             :         }
     118             :     }
     119          24 : }
     120             : 
     121             : #endif

Generated by: LCOV version 1.14