Line data Source code
1 : // SPDX-License-Identifier: MIT 2 : // Copyright (c) 2019, Guilherme Agostinelli 3 : // Ported from https://github.com/guilhermeagostinelli/levenshtein/blob/master/levenshtein.cpp 4 : 5 : #include "cpl_levenshtein.h" 6 : 7 : #include <algorithm> 8 : #include <cstring> 9 : #include <limits> 10 : #include <vector> 11 : 12 : /** Computes the Levenshtein distance between 2 words. 13 : * 14 : * If transpositionAllowed = true, then it is the Damerau-Levenshtein distance 15 : * 16 : * Return SIZE_T_MAX in case of error. 17 : */ 18 145006 : size_t CPLLevenshteinDistance(const char *word1, const char *word2, 19 : bool transpositionAllowed) 20 : { 21 145006 : const size_t size1 = strlen(word1); 22 145006 : const size_t size2 = strlen(word2); 23 : 24 : // If one of the words has zero length, the distance is equal to the size of the other word. 25 145006 : if (size1 == 0) 26 1 : return size2; 27 145005 : if (size2 == 0) 28 1 : return size1; 29 : 30 : // Would cause enormous amount of memory to be allocated 31 145004 : if (size1 >= 32768 || size2 >= 32768) 32 : { 33 2 : return strcmp(word1, word2) == 0 ? 0 34 2 : : std::numeric_limits<size_t>::max(); 35 : } 36 : 37 : // Verification matrix i.e. 2D array which will store the calculated distance. 38 145002 : const size_t dimFastSize = size2 + 1; 39 290004 : std::vector<unsigned short> verif; 40 : try 41 : { 42 145002 : verif.resize((size1 + 1) * dimFastSize); 43 : } 44 0 : catch (const std::exception &) 45 : { 46 0 : return std::numeric_limits<size_t>::max(); 47 : } 48 : 49 33331200 : const auto verifVal = [&verif, dimFastSize](size_t i, 50 33331200 : size_t j) -> unsigned short & 51 33331200 : { return verif[i * dimFastSize + j]; }; 52 : 53 : // Sets the first row and the first column of the verification matrix with the numerical order from 0 to the length of each word. 54 1309930 : for (size_t i = 0; i <= size1; i++) 55 1164930 : verifVal(i, 0) = static_cast<unsigned short>(i); 56 1381220 : for (size_t j = 0; j <= size2; j++) 57 1236220 : verifVal(0, j) = static_cast<unsigned short>(j); 58 : 59 : // Verification step / matrix filling. 60 1164930 : for (size_t i = 1; i <= size1; i++) 61 : { 62 8695810 : for (size_t j = 1; j <= size2; j++) 63 : { 64 : // Sets the modification cost. 65 : // 0 means no modification (i.e. equal letters) and 1 means that a modification is needed (i.e. unequal letters). 66 7675880 : const int cost = (word2[j - 1] == word1[i - 1]) ? 0 : 1; 67 : 68 : // Sets the current position of the matrix as the minimum value between a (deletion), b (insertion) and c (substitution). 69 : // a = the upper adjacent value plus 1: verif[i - 1][j] + 1 70 : // b = the left adjacent value plus 1: verif[i][j - 1] + 1 71 : // c = the upper left adjacent value plus the modification cost: verif[i - 1][j - 1] + cost 72 7675880 : verifVal(i, j) = std::min( 73 23027600 : std::min(static_cast<unsigned short>(verifVal(i - 1, j) + 1), 74 7675880 : static_cast<unsigned short>(verifVal(i, j - 1) + 1)), 75 15351800 : static_cast<unsigned short>(verifVal(i - 1, j - 1) + cost)); 76 : 77 : // Cf https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance 78 : // (note that in the Wikipedia page, a[] and b[] are indexed from 1... 79 7675880 : if (transpositionAllowed && i > 1 && j > 1 && 80 5709710 : word1[i - 1] == word2[j - 2] && word1[i - 2] == word2[j - 1]) 81 : { 82 27184 : verifVal(i, j) = std::min( 83 27184 : verifVal(i, j), 84 54368 : static_cast<unsigned short>(verifVal(i - 2, j - 2) + 1)); 85 : } 86 : } 87 : } 88 : 89 : // The last position of the matrix will contain the Levenshtein distance. 90 145002 : return verifVal(size1, size2); 91 : }