RapidFuzz
Loading...
Searching...
No Matches
fuzz_impl.hpp
1/* SPDX-License-Identifier: MIT */
2/* Copyright © 2021-present Max Bachmann */
3/* Copyright © 2011 Adam Cohen */
4
5#include "rapidfuzz/details/Range.hpp"
6#include <limits>
7#include <rapidfuzz/details/CharSet.hpp>
8
9#include <algorithm>
10#include <cmath>
11#include <iterator>
12#include <sys/types.h>
13#include <vector>
14
15namespace rapidfuzz {
16namespace fuzz {
17
18/**********************************************
19 * ratio
20 *********************************************/
21
22template <typename InputIt1, typename InputIt2>
23double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
24{
25 return ratio(detail::make_range(first1, last1), detail::make_range(first2, last2), score_cutoff);
26}
27
28template <typename Sentence1, typename Sentence2>
29double ratio(const Sentence1& s1, const Sentence2& s2, const double score_cutoff)
30{
31 return indel_normalized_similarity(s1, s2, score_cutoff / 100) * 100;
32}
33
34template <typename CharT1>
35template <typename InputIt2>
36double CachedRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
37 double score_hint) const
38{
39 return similarity(detail::make_range(first2, last2), score_cutoff, score_hint);
40}
41
42template <typename CharT1>
43template <typename Sentence2>
44double CachedRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff, double score_hint) const
45{
46 return cached_indel.normalized_similarity(s2, score_cutoff / 100, score_hint / 100) * 100;
47}
48
49/**********************************************
50 * partial_ratio
51 *********************************************/
52
53namespace fuzz_detail {
54
55static RAPIDFUZZ_CONSTEXPR_CXX14 double norm_distance(size_t dist, size_t lensum, double score_cutoff = 0)
56{
57 double score =
58 (lensum > 0) ? (100.0 - 100.0 * static_cast<double>(dist) / static_cast<double>(lensum)) : 100.0;
59
60 return (score >= score_cutoff) ? score : 0;
61}
62
63static inline size_t score_cutoff_to_distance(double score_cutoff, size_t lensum)
64{
65 return static_cast<size_t>(std::ceil(static_cast<double>(lensum) * (1.0 - score_cutoff / 100)));
66}
67
68template <typename InputIt1, typename InputIt2, typename CachedCharT1>
69ScoreAlignment<double>
70partial_ratio_impl(const detail::Range<InputIt1>& s1, const detail::Range<InputIt2>& s2,
71 const CachedRatio<CachedCharT1>& cached_ratio,
72 const detail::CharSet<iter_value_t<InputIt1>>& s1_char_set, double score_cutoff)
73{
74 ScoreAlignment<double> res;
75 size_t len1 = s1.size();
76 size_t len2 = s2.size();
77 res.src_start = 0;
78 res.src_end = len1;
79 res.dest_start = 0;
80 res.dest_end = len1;
81
82 if (len2 > len1) {
83 size_t maximum = len1 * 2;
84 double norm_cutoff_sim = rapidfuzz::detail::NormSim_to_NormDist(score_cutoff / 100);
85 size_t cutoff_dist = static_cast<size_t>(std::ceil(static_cast<double>(maximum) * norm_cutoff_sim));
86 size_t best_dist = std::numeric_limits<size_t>::max();
87 std::vector<size_t> scores(len2 - len1, std::numeric_limits<size_t>::max());
88 std::vector<std::pair<size_t, size_t>> windows = {{0, len2 - len1 - 1}};
89 std::vector<std::pair<size_t, size_t>> new_windows;
90
91 while (!windows.empty()) {
92 for (const auto& window : windows) {
93 auto subseq1_first = s2.begin() + static_cast<ptrdiff_t>(window.first);
94 auto subseq2_first = s2.begin() + static_cast<ptrdiff_t>(window.second);
95 auto subseq1 =
96 detail::make_range(subseq1_first, subseq1_first + static_cast<ptrdiff_t>(len1));
97 auto subseq2 =
98 detail::make_range(subseq2_first, subseq2_first + static_cast<ptrdiff_t>(len1));
99
100 if (scores[window.first] == std::numeric_limits<size_t>::max()) {
101 scores[window.first] = cached_ratio.cached_indel.distance(subseq1);
102 if (scores[window.first] < cutoff_dist) {
103 cutoff_dist = best_dist = scores[window.first];
104 res.dest_start = window.first;
105 res.dest_end = window.first + len1;
106 if (best_dist == 0) {
107 res.score = 100;
108 return res;
109 }
110 }
111 }
112 if (scores[window.second] == std::numeric_limits<size_t>::max()) {
113 scores[window.second] = cached_ratio.cached_indel.distance(subseq2);
114 if (scores[window.second] < cutoff_dist) {
115 cutoff_dist = best_dist = scores[window.second];
116 res.dest_start = window.second;
117 res.dest_end = window.second + len1;
118 if (best_dist == 0) {
119 res.score = 100;
120 return res;
121 }
122 }
123 }
124
125 size_t cell_diff = window.second - window.first;
126 if (cell_diff == 1) continue;
127
128 /* find the minimum score possible in the range first <-> last */
129 size_t known_edits = detail::abs_diff(scores[window.first], scores[window.second]);
130 /* half of the cells that are not needed for known_edits can lead to a better score */
131 size_t max_score_improvement = (cell_diff - known_edits / 2) / 2 * 2;
132 ptrdiff_t min_score =
133 static_cast<ptrdiff_t>(std::min(scores[window.first], scores[window.second])) -
134 static_cast<ptrdiff_t>(max_score_improvement);
135 if (min_score < static_cast<ptrdiff_t>(cutoff_dist)) {
136 size_t center = cell_diff / 2;
137 new_windows.emplace_back(window.first, window.first + center);
138 new_windows.emplace_back(window.first + center, window.second);
139 }
140 }
141
142 std::swap(windows, new_windows);
143 new_windows.clear();
144 }
145
146 double score = 1.0 - (static_cast<double>(best_dist) / static_cast<double>(maximum));
147 score *= 100;
148 if (score >= score_cutoff) score_cutoff = res.score = score;
149 }
150
151 for (size_t i = 1; i < len1; ++i) {
152 auto subseq = rapidfuzz::detail::make_range(s2.begin(), s2.begin() + static_cast<ptrdiff_t>(i));
153 if (!s1_char_set.find(subseq.back())) continue;
154
155 double ls_ratio = cached_ratio.similarity(subseq, score_cutoff);
156 if (ls_ratio > res.score) {
157 score_cutoff = res.score = ls_ratio;
158 res.dest_start = 0;
159 res.dest_end = i;
160 if (res.score == 100.0) return res;
161 }
162 }
163
164 for (size_t i = len2 - len1; i < len2; ++i) {
165 auto subseq = rapidfuzz::detail::make_range(s2.begin() + static_cast<ptrdiff_t>(i), s2.end());
166 if (!s1_char_set.find(subseq.front())) continue;
167
168 double ls_ratio = cached_ratio.similarity(subseq, score_cutoff);
169 if (ls_ratio > res.score) {
170 score_cutoff = res.score = ls_ratio;
171 res.dest_start = i;
172 res.dest_end = len2;
173 if (res.score == 100.0) return res;
174 }
175 }
176
177 return res;
178}
179
180template <typename InputIt1, typename InputIt2, typename CharT1 = iter_value_t<InputIt1>>
181ScoreAlignment<double> partial_ratio_impl(const detail::Range<InputIt1>& s1,
182 const detail::Range<InputIt2>& s2, double score_cutoff)
183{
184 CachedRatio<CharT1> cached_ratio(s1);
185
186 detail::CharSet<CharT1> s1_char_set;
187 for (auto ch : s1)
188 s1_char_set.insert(ch);
189
190 return partial_ratio_impl(s1, s2, cached_ratio, s1_char_set, score_cutoff);
191}
192
193} // namespace fuzz_detail
194
195template <typename InputIt1, typename InputIt2>
196ScoreAlignment<double> partial_ratio_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2,
197 InputIt2 last2, double score_cutoff)
198{
199 size_t len1 = static_cast<size_t>(std::distance(first1, last1));
200 size_t len2 = static_cast<size_t>(std::distance(first2, last2));
201
202 if (len1 > len2) {
203 ScoreAlignment<double> result = partial_ratio_alignment(first2, last2, first1, last1, score_cutoff);
204 std::swap(result.src_start, result.dest_start);
205 std::swap(result.src_end, result.dest_end);
206 return result;
207 }
208
209 if (score_cutoff > 100) return ScoreAlignment<double>(0, 0, len1, 0, len1);
210
211 if (!len1 || !len2)
212 return ScoreAlignment<double>(static_cast<double>(len1 == len2) * 100.0, 0, len1, 0, len1);
213
214 auto s1 = detail::make_range(first1, last1);
215 auto s2 = detail::make_range(first2, last2);
216
217 auto alignment = fuzz_detail::partial_ratio_impl(s1, s2, score_cutoff);
218 if (alignment.score != 100 && s1.size() == s2.size()) {
219 score_cutoff = std::max(score_cutoff, alignment.score);
220 auto alignment2 = fuzz_detail::partial_ratio_impl(s2, s1, score_cutoff);
221 if (alignment2.score > alignment.score) {
222 std::swap(alignment2.src_start, alignment2.dest_start);
223 std::swap(alignment2.src_end, alignment2.dest_end);
224 return alignment2;
225 }
226 }
227
228 return alignment;
229}
230
231template <typename Sentence1, typename Sentence2>
232ScoreAlignment<double> partial_ratio_alignment(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
233{
234 return partial_ratio_alignment(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
235 detail::to_end(s2), score_cutoff);
236}
237
238template <typename InputIt1, typename InputIt2>
239double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
240{
241 return partial_ratio_alignment(first1, last1, first2, last2, score_cutoff).score;
242}
243
244template <typename Sentence1, typename Sentence2>
245double partial_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
246{
247 return partial_ratio_alignment(s1, s2, score_cutoff).score;
248}
249
250template <typename CharT1>
251template <typename InputIt1>
252CachedPartialRatio<CharT1>::CachedPartialRatio(InputIt1 first1, InputIt1 last1)
253 : s1(first1, last1), cached_ratio(first1, last1)
254{
255 for (const auto& ch : s1)
256 s1_char_set.insert(ch);
257}
258
259template <typename CharT1>
260template <typename InputIt2>
261double CachedPartialRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
262 double) const
263{
264 size_t len1 = s1.size();
265 size_t len2 = static_cast<size_t>(std::distance(first2, last2));
266
267 if (len1 > len2)
268 return partial_ratio(detail::to_begin(s1), detail::to_end(s1), first2, last2, score_cutoff);
269
270 if (score_cutoff > 100) return 0;
271
272 if (!len1 || !len2) return static_cast<double>(len1 == len2) * 100.0;
273
274 auto s1_ = detail::make_range(s1);
275 auto s2 = detail::make_range(first2, last2);
276
277 double score = fuzz_detail::partial_ratio_impl(s1_, s2, cached_ratio, s1_char_set, score_cutoff).score;
278 if (score != 100 && s1_.size() == s2.size()) {
279 score_cutoff = std::max(score_cutoff, score);
280 double score2 = fuzz_detail::partial_ratio_impl(s2, s1_, score_cutoff).score;
281 if (score2 > score) return score2;
282 }
283
284 return score;
285}
286
287template <typename CharT1>
288template <typename Sentence2>
289double CachedPartialRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff, double) const
290{
291 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
292}
293
294/**********************************************
295 * token_sort_ratio
296 *********************************************/
297template <typename InputIt1, typename InputIt2>
298double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
299{
300 if (score_cutoff > 100) return 0;
301
302 return ratio(detail::sorted_split(first1, last1).join(), detail::sorted_split(first2, last2).join(),
303 score_cutoff);
304}
305
306template <typename Sentence1, typename Sentence2>
307double token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
308{
309 return token_sort_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
310 detail::to_end(s2), score_cutoff);
311}
312
313template <typename CharT1>
314template <typename InputIt2>
315double CachedTokenSortRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
316 double) const
317{
318 if (score_cutoff > 100) return 0;
319
320 return cached_ratio.similarity(detail::sorted_split(first2, last2).join(), score_cutoff);
321}
322
323template <typename CharT1>
324template <typename Sentence2>
325double CachedTokenSortRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff, double) const
326{
327 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
328}
329
330/**********************************************
331 * partial_token_sort_ratio
332 *********************************************/
333
334template <typename InputIt1, typename InputIt2>
335double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
336 double score_cutoff)
337{
338 if (score_cutoff > 100) return 0;
339
340 return partial_ratio(detail::sorted_split(first1, last1).join(),
341 detail::sorted_split(first2, last2).join(), score_cutoff);
342}
343
344template <typename Sentence1, typename Sentence2>
345double partial_token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
346{
347 return partial_token_sort_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
348 detail::to_end(s2), score_cutoff);
349}
350
351template <typename CharT1>
352template <typename InputIt2>
353double CachedPartialTokenSortRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
354 double) const
355{
356 if (score_cutoff > 100) return 0;
357
358 return cached_partial_ratio.similarity(detail::sorted_split(first2, last2).join(), score_cutoff);
359}
360
361template <typename CharT1>
362template <typename Sentence2>
363double CachedPartialTokenSortRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff, double) const
364{
365 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
366}
367
368/**********************************************
369 * token_set_ratio
370 *********************************************/
371
372namespace fuzz_detail {
373template <typename InputIt1, typename InputIt2>
374double token_set_ratio(const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_a,
375 const rapidfuzz::detail::SplittedSentenceView<InputIt2>& tokens_b,
376 const double score_cutoff)
377{
378 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
379 * see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
380 if (tokens_a.empty() || tokens_b.empty()) return 0;
381
382 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
383 auto intersect = decomposition.intersection;
384 auto diff_ab = decomposition.difference_ab;
385 auto diff_ba = decomposition.difference_ba;
386
387 // one sentence is part of the other one
388 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) return 100;
389
390 auto diff_ab_joined = diff_ab.join();
391 auto diff_ba_joined = diff_ba.join();
392
393 size_t ab_len = diff_ab_joined.size();
394 size_t ba_len = diff_ba_joined.size();
395 size_t sect_len = intersect.length();
396
397 // string length sect+ab <-> sect and sect+ba <-> sect
398 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
399 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
400
401 double result = 0;
402 size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
403 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
404
405 if (dist <= cutoff_distance) result = norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff);
406
407 // exit early since the other ratios are 0
408 if (!sect_len) return result;
409
410 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
411 // since only sect is similar in them the distance can be calculated based on
412 // the length difference
413 size_t sect_ab_dist = bool(sect_len) + ab_len;
414 double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
415
416 size_t sect_ba_dist = bool(sect_len) + ba_len;
417 double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
418
419 return std::max({result, sect_ab_ratio, sect_ba_ratio});
420}
421} // namespace fuzz_detail
422
423template <typename InputIt1, typename InputIt2>
424double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
425{
426 if (score_cutoff > 100) return 0;
427
428 return fuzz_detail::token_set_ratio(detail::sorted_split(first1, last1),
429 detail::sorted_split(first2, last2), score_cutoff);
430}
431
432template <typename Sentence1, typename Sentence2>
433double token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
434{
435 return token_set_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
436 score_cutoff);
437}
438
439template <typename CharT1>
440template <typename InputIt2>
441double CachedTokenSetRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
442 double) const
443{
444 if (score_cutoff > 100) return 0;
445
446 return fuzz_detail::token_set_ratio(tokens_s1, detail::sorted_split(first2, last2), score_cutoff);
447}
448
449template <typename CharT1>
450template <typename Sentence2>
451double CachedTokenSetRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff, double) const
452{
453 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
454}
455
456/**********************************************
457 * partial_token_set_ratio
458 *********************************************/
459
460namespace fuzz_detail {
461template <typename InputIt1, typename InputIt2>
462double partial_token_set_ratio(const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_a,
463 const rapidfuzz::detail::SplittedSentenceView<InputIt2>& tokens_b,
464 const double score_cutoff)
465{
466 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
467 * see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
468 if (tokens_a.empty() || tokens_b.empty()) return 0;
469
470 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
471
472 // exit early when there is a common word in both sequences
473 if (!decomposition.intersection.empty()) return 100;
474
475 return partial_ratio(decomposition.difference_ab.join(), decomposition.difference_ba.join(),
476 score_cutoff);
477}
478} // namespace fuzz_detail
479
480template <typename InputIt1, typename InputIt2>
481double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
482 double score_cutoff)
483{
484 if (score_cutoff > 100) return 0;
485
486 return fuzz_detail::partial_token_set_ratio(detail::sorted_split(first1, last1),
487 detail::sorted_split(first2, last2), score_cutoff);
488}
489
490template <typename Sentence1, typename Sentence2>
491double partial_token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
492{
493 return partial_token_set_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
494 detail::to_end(s2), score_cutoff);
495}
496
497template <typename CharT1>
498template <typename InputIt2>
499double CachedPartialTokenSetRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
500 double) const
501{
502 if (score_cutoff > 100) return 0;
503
504 return fuzz_detail::partial_token_set_ratio(tokens_s1, detail::sorted_split(first2, last2), score_cutoff);
505}
506
507template <typename CharT1>
508template <typename Sentence2>
509double CachedPartialTokenSetRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff, double) const
510{
511 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
512}
513
514/**********************************************
515 * token_ratio
516 *********************************************/
517
518template <typename InputIt1, typename InputIt2>
519double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
520{
521 if (score_cutoff > 100) return 0;
522
523 auto tokens_a = detail::sorted_split(first1, last1);
524 auto tokens_b = detail::sorted_split(first2, last2);
525
526 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
527 auto intersect = decomposition.intersection;
528 auto diff_ab = decomposition.difference_ab;
529 auto diff_ba = decomposition.difference_ba;
530
531 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) return 100;
532
533 auto diff_ab_joined = diff_ab.join();
534 auto diff_ba_joined = diff_ba.join();
535
536 size_t ab_len = diff_ab_joined.size();
537 size_t ba_len = diff_ba_joined.size();
538 size_t sect_len = intersect.length();
539
540 double result = ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
541
542 // string length sect+ab <-> sect and sect+ba <-> sect
543 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
544 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
545
546 size_t cutoff_distance = fuzz_detail::score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
547 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
548 if (dist <= cutoff_distance)
549 result = std::max(result, fuzz_detail::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
550
551 // exit early since the other ratios are 0
552 if (!sect_len) return result;
553
554 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
555 // since only sect is similar in them the distance can be calculated based on
556 // the length difference
557 size_t sect_ab_dist = bool(sect_len) + ab_len;
558 double sect_ab_ratio = fuzz_detail::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
559
560 size_t sect_ba_dist = bool(sect_len) + ba_len;
561 double sect_ba_ratio = fuzz_detail::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
562
563 return std::max({result, sect_ab_ratio, sect_ba_ratio});
564}
565
566template <typename Sentence1, typename Sentence2>
567double token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
568{
569 return token_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
570 score_cutoff);
571}
572
573namespace fuzz_detail {
574template <typename CharT1, typename CachedCharT1, typename InputIt2>
575double token_ratio(const rapidfuzz::detail::SplittedSentenceView<CharT1>& s1_tokens,
576 const CachedRatio<CachedCharT1>& cached_ratio_s1_sorted, InputIt2 first2, InputIt2 last2,
577 double score_cutoff)
578{
579 if (score_cutoff > 100) return 0;
580
581 auto s2_tokens = detail::sorted_split(first2, last2);
582
583 auto decomposition = detail::set_decomposition(s1_tokens, s2_tokens);
584 auto intersect = decomposition.intersection;
585 auto diff_ab = decomposition.difference_ab;
586 auto diff_ba = decomposition.difference_ba;
587
588 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) return 100;
589
590 auto diff_ab_joined = diff_ab.join();
591 auto diff_ba_joined = diff_ba.join();
592
593 size_t ab_len = diff_ab_joined.size();
594 size_t ba_len = diff_ba_joined.size();
595 size_t sect_len = intersect.length();
596
597 double result = cached_ratio_s1_sorted.similarity(s2_tokens.join(), score_cutoff);
598
599 // string length sect+ab <-> sect and sect+ba <-> sect
600 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
601 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
602
603 size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
604 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
605 if (dist <= cutoff_distance)
606 result = std::max(result, norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
607
608 // exit early since the other ratios are 0
609 if (!sect_len) return result;
610
611 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
612 // since only sect is similar in them the distance can be calculated based on
613 // the length difference
614 size_t sect_ab_dist = bool(sect_len) + ab_len;
615 double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
616
617 size_t sect_ba_dist = bool(sect_len) + ba_len;
618 double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
619
620 return std::max({result, sect_ab_ratio, sect_ba_ratio});
621}
622
623// todo this is a temporary solution until WRatio is properly implemented using other scorers
624template <typename CharT1, typename InputIt1, typename InputIt2>
625double token_ratio(const std::vector<CharT1>& s1_sorted,
626 const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_s1,
627 const detail::BlockPatternMatchVector& blockmap_s1_sorted, InputIt2 first2, InputIt2 last2,
628 double score_cutoff)
629{
630 if (score_cutoff > 100) return 0;
631
632 auto tokens_b = detail::sorted_split(first2, last2);
633
634 auto decomposition = detail::set_decomposition(tokens_s1, tokens_b);
635 auto intersect = decomposition.intersection;
636 auto diff_ab = decomposition.difference_ab;
637 auto diff_ba = decomposition.difference_ba;
638
639 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) return 100;
640
641 auto diff_ab_joined = diff_ab.join();
642 auto diff_ba_joined = diff_ba.join();
643
644 size_t ab_len = diff_ab_joined.size();
645 size_t ba_len = diff_ba_joined.size();
646 size_t sect_len = intersect.length();
647
648 double result = 0;
649 auto s2_sorted = tokens_b.join();
650 if (s1_sorted.size() < 65) {
651 double norm_sim =
652 detail::indel_normalized_similarity(blockmap_s1_sorted, detail::make_range(s1_sorted),
653 detail::make_range(s2_sorted), score_cutoff / 100);
654 result = norm_sim * 100;
655 }
656 else {
657 result = fuzz::ratio(s1_sorted, s2_sorted, score_cutoff);
658 }
659
660 // string length sect+ab <-> sect and sect+ba <-> sect
661 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
662 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
663
664 size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
665 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
666 if (dist <= cutoff_distance)
667 result = std::max(result, norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
668
669 // exit early since the other ratios are 0
670 if (!sect_len) return result;
671
672 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
673 // since only sect is similar in them the distance can be calculated based on
674 // the length difference
675 size_t sect_ab_dist = bool(sect_len) + ab_len;
676 double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
677
678 size_t sect_ba_dist = bool(sect_len) + ba_len;
679 double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
680
681 return std::max({result, sect_ab_ratio, sect_ba_ratio});
682}
683} // namespace fuzz_detail
684
685template <typename CharT1>
686template <typename InputIt2>
687double CachedTokenRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
688 double) const
689{
690 return fuzz_detail::token_ratio(s1_tokens, cached_ratio_s1_sorted, first2, last2, score_cutoff);
691}
692
693template <typename CharT1>
694template <typename Sentence2>
695double CachedTokenRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff, double) const
696{
697 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
698}
699
700/**********************************************
701 * partial_token_ratio
702 *********************************************/
703
704template <typename InputIt1, typename InputIt2>
705double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
706 double score_cutoff)
707{
708 if (score_cutoff > 100) return 0;
709
710 auto tokens_a = detail::sorted_split(first1, last1);
711 auto tokens_b = detail::sorted_split(first2, last2);
712
713 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
714
715 // exit early when there is a common word in both sequences
716 if (!decomposition.intersection.empty()) return 100;
717
718 auto diff_ab = decomposition.difference_ab;
719 auto diff_ba = decomposition.difference_ba;
720
721 double result = partial_ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
722
723 // do not calculate the same partial_ratio twice
724 if (tokens_a.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
725 return result;
726 }
727
728 score_cutoff = std::max(score_cutoff, result);
729 return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
730}
731
732template <typename Sentence1, typename Sentence2>
733double partial_token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
734{
735 return partial_token_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
736 detail::to_end(s2), score_cutoff);
737}
738
739namespace fuzz_detail {
740template <typename CharT1, typename InputIt1, typename InputIt2>
741double partial_token_ratio(const std::vector<CharT1>& s1_sorted,
742 const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_s1,
743 InputIt2 first2, InputIt2 last2, double score_cutoff)
744{
745 if (score_cutoff > 100) return 0;
746
747 auto tokens_b = detail::sorted_split(first2, last2);
748
749 auto decomposition = detail::set_decomposition(tokens_s1, tokens_b);
750
751 // exit early when there is a common word in both sequences
752 if (!decomposition.intersection.empty()) return 100;
753
754 auto diff_ab = decomposition.difference_ab;
755 auto diff_ba = decomposition.difference_ba;
756
757 double result = partial_ratio(s1_sorted, tokens_b.join(), score_cutoff);
758
759 // do not calculate the same partial_ratio twice
760 if (tokens_s1.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
761 return result;
762 }
763
764 score_cutoff = std::max(score_cutoff, result);
765 return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
766}
767
768} // namespace fuzz_detail
769
770template <typename CharT1>
771template <typename InputIt2>
772double CachedPartialTokenRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
773 double) const
774{
775 return fuzz_detail::partial_token_ratio(s1_sorted, tokens_s1, first2, last2, score_cutoff);
776}
777
778template <typename CharT1>
779template <typename Sentence2>
780double CachedPartialTokenRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff, double) const
781{
782 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
783}
784
785/**********************************************
786 * WRatio
787 *********************************************/
788
789template <typename InputIt1, typename InputIt2>
790double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
791{
792 if (score_cutoff > 100) return 0;
793
794 constexpr double UNBASE_SCALE = 0.95;
795
796 auto len1 = std::distance(first1, last1);
797 auto len2 = std::distance(first2, last2);
798
799 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
800 * see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
801 if (!len1 || !len2) return 0;
802
803 double len_ratio = (len1 > len2) ? static_cast<double>(len1) / static_cast<double>(len2)
804 : static_cast<double>(len2) / static_cast<double>(len1);
805
806 double end_ratio = ratio(first1, last1, first2, last2, score_cutoff);
807
808 if (len_ratio < 1.5) {
809 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
810 return std::max(end_ratio, token_ratio(first1, last1, first2, last2, score_cutoff) * UNBASE_SCALE);
811 }
812
813 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
814
815 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
816 end_ratio =
817 std::max(end_ratio, partial_ratio(first1, last1, first2, last2, score_cutoff) * PARTIAL_SCALE);
818
819 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
820 return std::max(end_ratio, partial_token_ratio(first1, last1, first2, last2, score_cutoff) *
821 UNBASE_SCALE * PARTIAL_SCALE);
822}
823
824template <typename Sentence1, typename Sentence2>
825double WRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
826{
827 return WRatio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
828 score_cutoff);
829}
830
831template <typename Sentence1>
832template <typename InputIt1>
833CachedWRatio<Sentence1>::CachedWRatio(InputIt1 first1, InputIt1 last1)
834 : s1(first1, last1),
835 cached_partial_ratio(first1, last1),
836 tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1))),
837 s1_sorted(tokens_s1.join()),
838 blockmap_s1_sorted(detail::make_range(s1_sorted))
839{}
840
841template <typename CharT1>
842template <typename InputIt2>
843double CachedWRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff, double) const
844{
845 if (score_cutoff > 100) return 0;
846
847 constexpr double UNBASE_SCALE = 0.95;
848
849 size_t len1 = s1.size();
850 size_t len2 = static_cast<size_t>(std::distance(first2, last2));
851
852 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
853 * see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
854 if (!len1 || !len2) return 0;
855
856 double len_ratio = (len1 > len2) ? static_cast<double>(len1) / static_cast<double>(len2)
857 : static_cast<double>(len2) / static_cast<double>(len1);
858
859 double end_ratio = cached_partial_ratio.cached_ratio.similarity(first2, last2, score_cutoff);
860
861 if (len_ratio < 1.5) {
862 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
863 // use pre calculated values
864 auto r =
865 fuzz_detail::token_ratio(s1_sorted, tokens_s1, blockmap_s1_sorted, first2, last2, score_cutoff);
866 return std::max(end_ratio, r * UNBASE_SCALE);
867 }
868
869 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
870
871 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
872 end_ratio =
873 std::max(end_ratio, cached_partial_ratio.similarity(first2, last2, score_cutoff) * PARTIAL_SCALE);
874
875 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
876 auto r = fuzz_detail::partial_token_ratio(s1_sorted, tokens_s1, first2, last2, score_cutoff);
877 return std::max(end_ratio, r * UNBASE_SCALE * PARTIAL_SCALE);
878}
879
880template <typename CharT1>
881template <typename Sentence2>
882double CachedWRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff, double) const
883{
884 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
885}
886
887/**********************************************
888 * QRatio
889 *********************************************/
890
891template <typename InputIt1, typename InputIt2>
892double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
893{
894 ptrdiff_t len1 = std::distance(first1, last1);
895 ptrdiff_t len2 = std::distance(first2, last2);
896
897 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
898 * see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
899 if (!len1 || !len2) return 0;
900
901 return ratio(first1, last1, first2, last2, score_cutoff);
902}
903
904template <typename Sentence1, typename Sentence2>
905double QRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
906{
907 return QRatio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
908 score_cutoff);
909}
910
911template <typename CharT1>
912template <typename InputIt2>
913double CachedQRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff, double) const
914{
915 auto len2 = std::distance(first2, last2);
916
917 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
918 * see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
919 if (s1.empty() || !len2) return 0;
920
921 return cached_ratio.similarity(first2, last2, score_cutoff);
922}
923
924template <typename CharT1>
925template <typename Sentence2>
926double CachedQRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff, double) const
927{
928 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
929}
930
931} // namespace fuzz
932} // namespace rapidfuzz
double WRatio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Calculates a weighted ratio based on the other ratio algorithms.
Definition fuzz_impl.hpp:825
double ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
calculates a simple ratio between two strings
Definition fuzz_impl.hpp:29
double QRatio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Calculates a quick ratio between two strings using fuzz.ratio.
Definition fuzz_impl.hpp:905
double partial_token_set_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::partial_r...
Definition fuzz_impl.hpp:491
double token_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::token_set_ratio and fuzz::token_sort_ratio (faster th...
Definition fuzz_impl.hpp:567
double partial_token_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::partial_token_set_ratio and fuzz::partial_token_sort_...
Definition fuzz_impl.hpp:733
double token_set_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::ratio.
Definition fuzz_impl.hpp:433
double partial_token_sort_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::partial_ratio between them.
Definition fuzz_impl.hpp:345
double token_sort_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::ratio between them.
Definition fuzz_impl.hpp:307
double partial_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
calculates the fuzz::ratio of the optimal string alignment
Definition fuzz_impl.hpp:245