/
FuzzyScore.java
143 lines (124 loc) · 5.22 KB
/
FuzzyScore.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.text.similarity;
import java.util.Locale;
/**
* A matching algorithm that is similar to the searching algorithms implemented in editors such
* as Sublime Text, TextMate, Atom and others.
*
* <p>
* One point is given for every matched character. Subsequent matches yield two bonus points. A higher score
* indicates a higher similarity.
* </p>
*
* <p>
* This code has been adapted from Apache Commons Lang 3.3.
* </p>
*
* @since 1.0
*/
public class FuzzyScore {
/**
* Locale used to change the case of text.
*/
private final Locale locale;
/**
* This returns a {@link Locale}-specific {@link FuzzyScore}.
*
* @param locale The string matching logic is case insensitive.
A {@link Locale} is necessary to normalize both Strings to lower case.
* @throws IllegalArgumentException
* This is thrown if the {@link Locale} parameter is {@code null}.
*/
public FuzzyScore(final Locale locale) {
if (locale == null) {
throw new IllegalArgumentException("Locale must not be null");
}
this.locale = locale;
}
/**
* Find the Fuzzy Score which indicates the similarity score between two
* Strings.
*
* <pre>
* score.fuzzyScore(null, null) = IllegalArgumentException
* score.fuzzyScore("not null", null) = IllegalArgumentException
* score.fuzzyScore(null, "not null") = IllegalArgumentException
* score.fuzzyScore("", "") = 0
* score.fuzzyScore("Workshop", "b") = 0
* score.fuzzyScore("Room", "o") = 1
* score.fuzzyScore("Workshop", "w") = 1
* score.fuzzyScore("Workshop", "ws") = 2
* score.fuzzyScore("Workshop", "wo") = 4
* score.fuzzyScore("Apache Software Foundation", "asf") = 3
* </pre>
*
* @param term a full term that should be matched against, must not be null
* @param query the query that will be matched against a term, must not be
* null
* @return result score
* @throws IllegalArgumentException if the term or query is {@code null}
*/
public Integer fuzzyScore(final CharSequence term, final CharSequence query) {
if (term == null || query == null) {
throw new IllegalArgumentException("CharSequences must not be null");
}
// fuzzy logic is case insensitive. We normalize the Strings to lower
// case right from the start. Turning characters to lower case
// via Character.toLowerCase(char) is unfortunately insufficient
// as it does not accept a locale.
final String termLowerCase = term.toString().toLowerCase(locale);
final String queryLowerCase = query.toString().toLowerCase(locale);
// the resulting score
int score = 0;
// the position in the term which will be scanned next for potential
// query character matches
int termIndex = 0;
// index of the previously matched character in the term
int previousMatchingCharacterIndex = Integer.MIN_VALUE;
for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) {
final char queryChar = queryLowerCase.charAt(queryIndex);
boolean termCharacterMatchFound = false;
for (; termIndex < termLowerCase.length()
&& !termCharacterMatchFound; termIndex++) {
final char termChar = termLowerCase.charAt(termIndex);
if (queryChar == termChar) {
// simple character matches result in one point
score++;
// subsequent character matches further improve
// the score.
if (previousMatchingCharacterIndex + 1 == termIndex) {
score += 2;
}
previousMatchingCharacterIndex = termIndex;
// we can leave the nested loop. Every character in the
// query can match at most one character in the term.
termCharacterMatchFound = true;
}
}
}
return score;
}
/**
* Gets the locale.
*
* @return The locale
*/
public Locale getLocale() {
return locale;
}
}