fuzzy search - Use gems with c-extensions in JRuby, or port them to Java? -
i'm doing fuzzy match test between input string , entered strings. test performed live while typing.
i have shockingly accurate algorithm in place called strikeamatch, has been translated many languages. fastest ruby implementation i've found amatch. however, incompatible jruby environment because crunches data in c extension requires c interpreter ruby (mri). it's pretty fast though:
a = "lorem ipsum dolor" b = "lorem ipsum dolor sit amet, consectetuer adipiscing elit. nam cursus. morbi ut mi. nullam enim leo, egestas id, condimentum at, laoreet mattis, massa. sed eleifend nonummy diam. praesent mauris ante," puts benchmark.measure { 10000.times { a.pair_distance_similar(b) } } # => 0.130000 0.000000 0.130000 ( 0.146347)
i hope can avoid setting alternative environment. alternative approach try , port original java code as suggested in jruby wiki. not sure how though.
any ideas how approach this?
the algorithm easy implement. example, here's quick implementation wrote in java:
import java.util.hashmap; import java.util.hashset; import java.util.map; import java.util.set; public class strikeamatch { protected int numpairs(final string string) { return math.max(0, string.length() - 1); } protected map<string, integer> pairs(final string string) { if (string.isempty()) { return new hashmap<>(0); } final map<string, integer> pairs = new hashmap(2 * numpairs(string)); (int = 1, j = string.length(); < j; += 1) { final char = string.charat(i - 1); final char b = string.charat(i); final string pair = string.format("%c%c", a, b); if (pairs.containskey(pair)) { pairs.put(pair, 1 + pairs.get(pair)); } else { pairs.put(pair, 1); } } return pairs; } protected int intersectionsize( final map<string, integer> lhspairs, final map<string, integer> rhspairs) { final set<string> lhskeys = lhspairs.keyset(); final set<string> rhskeys = rhspairs.keyset(); final set<string> pairintersection = new hashset<>(lhskeys); pairintersection.retainall(rhskeys); int sharedpairs = 0; (final string pair : pairintersection) { sharedpairs += math.min(lhspairs.get(pair), rhspairs.get(pair)); } return sharedpairs; } public double between(final string lhs, final string rhs) { if (lhs.isempty() && rhs.isempty()) { return 1.0; } final map<string, integer> lhspairs = pairs(lhs.touppercase()); final map<string, integer> rhspairs = pairs(rhs.touppercase()); return (2.0 * intersectionsize(lhspairs, rhspairs)) / (numpairs(lhs) + numpairs(rhs)); } public static void main(final string[] args) { final strikeamatch distance = new strikeamatch(); (final string lhs : args) { (final string rhs : args) { system.out.printf("d(\"%s\", \"%s\") = [%.7f]%n", lhs, rhs, distance.between(lhs, rhs)); } } } }
you can pad first , last characters, extend one-character or zero-character terms:
import java.util.hashmap; import java.util.map; public class paddedstrikeamatch extends strikeamatch { protected int numpairs(final string string) { return string.length() + 1; } protected map<string, integer> pairs(final string string) { if (string.isempty()) { final map<string, integer> pairs = new hashmap<>(1); pairs.put(" ", 1); return pairs; } final map<string, integer> pairs = new hashmap(2 * numpairs(string)); pairs.put(string.format(" %c", string.charat(0)), 1); pairs.put(string.format("%c ", string.charat(string.length() - 1)), 1); (int = 1, j = string.length(); < j; += 1) { final char = string.charat(i - 1); final char b = string.charat(i); final string pair = string.format("%c%c", a, b); if (pairs.containskey(pair)) { pairs.put(pair, 1 + pairs.get(pair)); } else { pairs.put(pair, 1); } } return pairs; } public static void main(final string[] args) { final strikeamatch distance = new paddedstrikeamatch(); (final string lhs : args) { (final string rhs : args) { system.out.printf("d(\"%s\", \"%s\") = [%.7f]%n", lhs, rhs, distance.between(lhs, rhs)); } } } }
to validate it, here examples proposed in link provided:
% java strikeamatch france french d("france", "france") = [1.0000000] d("france", "french") = [0.4000000] d("french", "france") = [0.4000000] d("french", "french") = [1.0000000] % java strikeamatch healed sealed healthy heard herded sold d("healed", "healed") = [1.0000000] d("healed", "sealed") = [0.8000000] d("healed", "healthy") = [0.5454545] d("healed", "heard") = [0.4444444] d("healed", "herded") = [0.4000000] d("healed", "help") = [0.2500000] d("healed", "sold") = [0.0000000] d("sealed", "healed") = [0.8000000] d("sealed", "sealed") = [1.0000000] d("sealed", "healthy") = [0.3636364] d("sealed", "heard") = [0.2222222] d("sealed", "herded") = [0.2000000] d("sealed", "help") = [0.0000000] d("sealed", "sold") = [0.0000000] d("healthy", "healed") = [0.5454545] d("healthy", "sealed") = [0.3636364] d("healthy", "healthy") = [1.0000000] d("healthy", "heard") = [0.4000000] d("healthy", "herded") = [0.1818182] d("healthy", "help") = [0.2222222] d("healthy", "sold") = [0.0000000] d("heard", "healed") = [0.4444444] d("heard", "sealed") = [0.2222222] d("heard", "healthy") = [0.4000000] d("heard", "heard") = [1.0000000] d("heard", "herded") = [0.4444444] d("heard", "help") = [0.2857143] d("heard", "sold") = [0.0000000] d("herded", "healed") = [0.4000000] d("herded", "sealed") = [0.2000000] d("herded", "healthy") = [0.1818182] d("herded", "heard") = [0.4444444] d("herded", "herded") = [1.0000000] d("herded", "help") = [0.2500000] d("herded", "sold") = [0.0000000] d("help", "healed") = [0.2500000] d("help", "sealed") = [0.0000000] d("help", "healthy") = [0.2222222] d("help", "heard") = [0.2857143] d("help", "herded") = [0.2500000] d("help", "help") = [1.0000000] d("help", "sold") = [0.0000000] d("sold", "healed") = [0.0000000] d("sold", "sealed") = [0.0000000] d("sold", "healthy") = [0.0000000] d("sold", "heard") = [0.0000000] d("sold", "herded") = [0.0000000] d("sold", "help") = [0.0000000] d("sold", "sold") = [1.0000000]
... , here's padded version:
% java paddedstrikeamatch france french d("france", "france") = [1.0000000] d("france", "french") = [0.4285714] d("french", "france") = [0.4285714] d("french", "french") = [1.0000000] % java paddedstrikeamatch healed sealed healthy heard herded sold d("healed", "healed") = [1.0000000] d("healed", "sealed") = [0.7142857] d("healed", "healthy") = [0.5333333] d("healed", "heard") = [0.6153846] d("healed", "herded") = [0.5714286] d("healed", "help") = [0.3333333] d("healed", "sold") = [0.1666667] d("sealed", "healed") = [0.7142857] d("sealed", "sealed") = [1.0000000] d("sealed", "healthy") = [0.2666667] d("sealed", "heard") = [0.3076923] d("sealed", "herded") = [0.2857143] d("sealed", "help") = [0.0000000] d("sealed", "sold") = [0.3333333] d("healthy", "healed") = [0.5333333] d("healthy", "sealed") = [0.2666667] d("healthy", "healthy") = [1.0000000] d("healthy", "heard") = [0.4285714] d("healthy", "herded") = [0.2666667] d("healthy", "help") = [0.3076923] d("healthy", "sold") = [0.0000000] d("heard", "healed") = [0.6153846] d("heard", "sealed") = [0.3076923] d("heard", "healthy") = [0.4285714] d("heard", "heard") = [1.0000000] d("heard", "herded") = [0.6153846] d("heard", "help") = [0.3636364] d("heard", "sold") = [0.1818182] d("herded", "healed") = [0.5714286] d("herded", "sealed") = [0.2857143] d("herded", "healthy") = [0.2666667] d("herded", "heard") = [0.6153846] d("herded", "herded") = [1.0000000] d("herded", "help") = [0.3333333] d("herded", "sold") = [0.1666667] d("help", "healed") = [0.3333333] d("help", "sealed") = [0.0000000] d("help", "healthy") = [0.3076923] d("help", "heard") = [0.3636364] d("help", "herded") = [0.3333333] d("help", "help") = [1.0000000] d("help", "sold") = [0.0000000] d("sold", "healed") = [0.1666667] d("sold", "sealed") = [0.3333333] d("sold", "healthy") = [0.0000000] d("sold", "heard") = [0.1818182] d("sold", "herded") = [0.1666667] d("sold", "help") = [0.0000000] d("sold", "sold") = [1.0000000]
if want use directly jruby, need add strikeamatch.class
$classpath
, , within script require 'java'
followed java_import 'strikeamatch
:
require 'java' java_import 'strikeamatch' distance = strikeamatch.new argv.each |lhs| argv.each |rhs| puts "d(\"#{lhs}\", \"#{rhs}\") = [#{distance.between(lhs, rhs)}]" end end
to invoke it:
% jruby strike_a_match.rb france french d("france", "france") = [1.0] d("france", "french") = [0.4] d("french", "france") = [0.4] d("french", "french") = [1.0]
Comments
Post a Comment