LCOV - code coverage report
Current view: top level - port - cpl_levenshtein.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 33 35 94.3 %
Date: 2025-05-24 03:54:53 Functions: 2 2 100.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: MIT
       2             : // Copyright (c) 2019, Guilherme Agostinelli
       3             : // Ported from https://github.com/guilhermeagostinelli/levenshtein/blob/master/levenshtein.cpp
       4             : 
       5             : #include "cpl_levenshtein.h"
       6             : 
       7             : #include <algorithm>
       8             : #include <cstring>
       9             : #include <limits>
      10             : #include <vector>
      11             : 
      12             : /** Computes the Levenshtein distance between 2 words.
      13             :  *
      14             :  * If transpositionAllowed = true, then it is the Damerau-Levenshtein distance
      15             :  *
      16             :  * Return SIZE_T_MAX in case of error.
      17             :  */
      18      145006 : size_t CPLLevenshteinDistance(const char *word1, const char *word2,
      19             :                               bool transpositionAllowed)
      20             : {
      21      145006 :     const size_t size1 = strlen(word1);
      22      145006 :     const size_t size2 = strlen(word2);
      23             : 
      24             :     // If one of the words has zero length, the distance is equal to the size of the other word.
      25      145006 :     if (size1 == 0)
      26           1 :         return size2;
      27      145005 :     if (size2 == 0)
      28           1 :         return size1;
      29             : 
      30             :     // Would cause enormous amount of memory to be allocated
      31      145004 :     if (size1 >= 32768 || size2 >= 32768)
      32             :     {
      33           2 :         return strcmp(word1, word2) == 0 ? 0
      34           2 :                                          : std::numeric_limits<size_t>::max();
      35             :     }
      36             : 
      37             :     // Verification matrix i.e. 2D array which will store the calculated distance.
      38      145002 :     const size_t dimFastSize = size2 + 1;
      39      290004 :     std::vector<unsigned short> verif;
      40             :     try
      41             :     {
      42      145002 :         verif.resize((size1 + 1) * dimFastSize);
      43             :     }
      44           0 :     catch (const std::exception &)
      45             :     {
      46           0 :         return std::numeric_limits<size_t>::max();
      47             :     }
      48             : 
      49    33331200 :     const auto verifVal = [&verif, dimFastSize](size_t i,
      50    33331200 :                                                 size_t j) -> unsigned short &
      51    33331200 :     { return verif[i * dimFastSize + j]; };
      52             : 
      53             :     // Sets the first row and the first column of the verification matrix with the numerical order from 0 to the length of each word.
      54     1309930 :     for (size_t i = 0; i <= size1; i++)
      55     1164930 :         verifVal(i, 0) = static_cast<unsigned short>(i);
      56     1381220 :     for (size_t j = 0; j <= size2; j++)
      57     1236220 :         verifVal(0, j) = static_cast<unsigned short>(j);
      58             : 
      59             :     // Verification step / matrix filling.
      60     1164930 :     for (size_t i = 1; i <= size1; i++)
      61             :     {
      62     8695810 :         for (size_t j = 1; j <= size2; j++)
      63             :         {
      64             :             // Sets the modification cost.
      65             :             // 0 means no modification (i.e. equal letters) and 1 means that a modification is needed (i.e. unequal letters).
      66     7675880 :             const int cost = (word2[j - 1] == word1[i - 1]) ? 0 : 1;
      67             : 
      68             :             // Sets the current position of the matrix as the minimum value between a (deletion), b (insertion) and c (substitution).
      69             :             // a = the upper adjacent value plus 1: verif[i - 1][j] + 1
      70             :             // b = the left adjacent value plus 1: verif[i][j - 1] + 1
      71             :             // c = the upper left adjacent value plus the modification cost: verif[i - 1][j - 1] + cost
      72     7675880 :             verifVal(i, j) = std::min(
      73    23027600 :                 std::min(static_cast<unsigned short>(verifVal(i - 1, j) + 1),
      74     7675880 :                          static_cast<unsigned short>(verifVal(i, j - 1) + 1)),
      75    15351800 :                 static_cast<unsigned short>(verifVal(i - 1, j - 1) + cost));
      76             : 
      77             :             // Cf https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance
      78             :             // (note that in the Wikipedia page, a[] and b[] are indexed from 1...
      79     7675880 :             if (transpositionAllowed && i > 1 && j > 1 &&
      80     5709710 :                 word1[i - 1] == word2[j - 2] && word1[i - 2] == word2[j - 1])
      81             :             {
      82       27184 :                 verifVal(i, j) = std::min(
      83       27184 :                     verifVal(i, j),
      84       54368 :                     static_cast<unsigned short>(verifVal(i - 2, j - 2) + 1));
      85             :             }
      86             :         }
      87             :     }
      88             : 
      89             :     // The last position of the matrix will contain the Levenshtein distance.
      90      145002 :     return verifVal(size1, size2);
      91             : }

Generated by: LCOV version 1.14