/* $Id: Metrics.cxx 15828 2013-03-28 11:55:53Z sloot $ $URL: https://ilk.uvt.nl/svn/trunk/sources/Timbl6/src/Metrics.cxx $ Copyright (c) 1998 - 2013 ILK - Tilburg University CLiPS - University of Antwerp This file is part of timbl timbl is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. timbl is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, see . For questions and suggestions, see: http://ilk.uvt.nl/software.html or send mail to: timbl@uvt.nl */ #include #include #include #include #include #include #include #include #include "timbl/Common.h" #include "timbl/MsgClass.h" #include "timbl/Types.h" #include "timbl/Instance.h" #include "timbl/Metrics.h" using namespace std; using namespace TiCC; using Common::Epsilon; using Common::Log2; //#define METRIC_DEBUG namespace Timbl{ const double maxSimilarity = std::numeric_limits::max(); double lv_distance( const string& source, const string& target ){ // code taken from: http://www.merriampark.com/ldcpp.htm // Levenshtein Distance Algorithm: C++ Implementation // by Anders Sewerin Johansen // Step 1 const size_t n = source.length(); const size_t m = target.length(); if (n == 0) { return (double)m; } if (m == 0) { return (double)n; } // Good form to declare a TYPEDEF typedef std::vector< std::vector > Tmatrix; Tmatrix matrix(n+1); // Size the vectors in the 2.nd dimension. Unfortunately C++ doesn't // allow for allocation on declaration of 2.nd dimension of vec of vec for ( size_t i = 0; i <= n; ++i ) { matrix[i].resize(m+1); } // Step 2 for ( size_t i = 0; i <= n; ++i ) { matrix[i][0]=i; } for ( size_t i = 0; i <= m; ++i ) { matrix[0][i]=i; } // Step 3 for ( size_t i = 1; i <= n; ++i ) { const char s_i = source[i-1]; // Step 4 for ( size_t j = 1; j <= m; ++j ) { const char t_j = target[j-1]; // Step 5 int cost; if (s_i == t_j) { cost = 0; } else { cost = 1; } // Step 6 const size_t above = matrix[i-1][j]; const size_t left = matrix[i][j-1]; const size_t diag = matrix[i-1][j-1]; size_t cell = min( above + 1, min(left + 1, diag + cost)); // Step 6A: Cover transposition, in addition to deletion, // insertion and substitution. This step is taken from: // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's // Enhanced Dynamic Programming ASM Algorithm" // (http://www.acm.org/~hlb/publications/asm/asm.html) if (i>2 && j>2) { size_t trans=matrix[i-2][j-2]+1; if (source[i-2]!=t_j) trans++; if (s_i!=target[j-2]) trans++; if (cell>trans) cell=trans; } matrix[i][j]=cell; } } return (double)matrix[n][m]; } double dc_distance( const string& string1, const string& string2 ){ // code taken from: // http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Dice's_coefficient unsigned int ls1 = string1.length(); unsigned int ls2 = string2.length(); double dice; int overlap = 0; int total = 0; if ( ls1 <= 1 || ls2 <= 1 ){ // back-off naar unigrammen set string1_unigrams; set string2_unigrams; for(unsigned int i = 0; i < ls1; i++) { string1_unigrams.insert(string1[i]); } for(unsigned int i = 0; i < ls2; i++) { string2_unigrams.insert(string2[i]); } set::const_iterator it = string2_unigrams.begin(); while ( it != string2_unigrams.end() ){ if ( string1_unigrams.find( *it ) != string1_unigrams.end() ) ++overlap; ++it; } total = string1_unigrams.size() + string2_unigrams.size(); } else { set string1_bigrams; set string2_bigrams; for(unsigned int i = 0; i < (ls1 - 1); i++) { // extract character bigrams from string1 string1_bigrams.insert(string1.substr(i, 2)); } for(unsigned int i = 0; i < (ls2 - 1); i++) { // extract character bigrams from string2 string2_bigrams.insert(string2.substr(i, 2)); } set::const_iterator it = string2_bigrams.begin(); while ( it != string2_bigrams.end() ){ if ( string1_bigrams.find( *it ) != string1_bigrams.end() ) ++overlap; ++ it; } total = string1_bigrams.size() + string2_bigrams.size(); } dice = (double)(overlap * 2) / (double)total; // we will return 1 - dice coefficient ad distance dice = 1.0 - dice; return dice; } double vd_distance( SparseValueProbClass *r, SparseValueProbClass *s ){ double result = 0.0; if ( ! ( r && s ) ) return 1.0; SparseValueProbClass::IDiterator p1 = r->begin(); SparseValueProbClass::IDiterator p2 = s->begin(); while( p1 != r->end() && p2 != s->end() ){ if ( p2->first < p1->first ){ result += p2->second; ++p2; } else if ( p2->first == p1->first ){ result += fabs( p1->second - p2->second ); ++p1; ++p2; } else { result += p1->second; ++p1; } } while ( p1 != r->end() ){ result += p1->second; ++p1; } while ( p2 != s->end() ){ result += p2->second; ++p2; } result = result / 2.0; return result; } double p_log_p_div_q( double p, double q ) { if ( abs(q) < Epsilon ) return 0; return p * Log2( p/q ); } double jd_distance( SparseValueProbClass *r, SparseValueProbClass *s ){ double part1 = 0.0; double part2 = 0.0; SparseValueProbClass::IDiterator p1 = r->begin(); SparseValueProbClass::IDiterator p2 = s->begin(); while( p1 != r->end() && p2 != s->end() ){ if ( p2->first < p1->first ){ part2 += p2->second; ++p2; } else if ( p2->first == p1->first ){ part1 += p_log_p_div_q( p1->second, p2->second ); part2 += p_log_p_div_q( p2->second, p1->second ); ++p1; ++p2; } else { part1 += p1->second; ++p1; } } while ( p1 != r->end() ){ part1 += p1->second; ++p1; } while ( p2 != s->end() ){ part2 += p2->second; ++p2; } double result = part1 + part2; result = result / 2.0; return result; } double k_log_k_div_m( double k, double l ) { if ( abs(k+l) < Epsilon ) return 0; return k * Log2( (2.0 * k)/( k + l ) ); } double js_distance( SparseValueProbClass *r, SparseValueProbClass *s ){ double part1 = 0.0; double part2 = 0.0; SparseValueProbClass::IDiterator p1 = r->begin(); SparseValueProbClass::IDiterator p2 = s->begin(); while( p1 != r->end() && p2 != s->end() ){ if ( p2->first < p1->first ){ part2 += p2->second; ++p2; } else if ( p2->first == p1->first ){ part1 += k_log_k_div_m( p1->second, p2->second ); part2 += k_log_k_div_m( p2->second, p1->second ); ++p1; ++p2; } else { part1 += p1->second; ++p1; } } while ( p1 != r->end() ){ part1 += p1->second; ++p1; } while ( p2 != s->end() ){ part2 += p2->second; ++p2; } double result = part1 + part2; result = result / 2.0; return result; } metricClass *getMetricClass( MetricType mt ){ switch ( mt ){ case Overlap: return new OverlapMetric(); break; case Numeric: return new NumericMetric(); break; case Euclidean: return new EuclideanMetric(); break; case Cosine: return new CosineMetric(); break; case DotProduct: return new DotProductMetric(); break; case ValueDiff: return new ValueDiffMetric(); break; case JeffreyDiv: return new JeffreyMetric(); break; case JSDiv: return new JSMetric(); break; case Levenshtein: return new LevenshteinMetric(); break; case Dice: return new DiceMetric(); break; case Ignore: return 0; break; default: throw logic_error("getMetricClass: unknown MetricType " + toString(mt) ); } } double OverlapMetric::distance( FeatureValue *F, FeatureValue *G, size_t, double ) const { if ( F == G ) return 0.0; else return 1.0; } inline bool FV_to_real( FeatureValue *FV, double &result ){ if ( FV ){ if ( stringTo( FV->Name(), result ) ) return true; } return false; } double JeffreyMetric::distance( FeatureValue *F, FeatureValue *G, size_t limit, double ) const { double result = 0.0; if ( G != F ){ if ( F->ValFreq() < limit || G->ValFreq() < limit ){ #ifdef METRIC_DEBUG cerr << "result = 1.0 vanwege F.valFreq=" << F->ValFreq() << " en G.valFreq()=" << G ->ValFreq() << " met limiet= " << limit << endl; #endif result = 1.0; } else { result = jd_distance( F->valueClassProb(), G->valueClassProb() ); } } return result; } double JSMetric::distance( FeatureValue *F, FeatureValue *G, size_t limit, double ) const { double result = 0.0; if ( G != F ){ if ( F->ValFreq() < limit || G->ValFreq() < limit ){ #ifdef METRIC_DEBUG cerr << "result = 1.0 vanwege F.valFreq=" << F->ValFreq() << " en G.valFreq()=" << G ->ValFreq() << " met limiet= " << limit << endl; #endif result = 1.0; } else { result = js_distance( F->valueClassProb(), G->valueClassProb() ); } } return result; } double LevenshteinMetric::distance( FeatureValue *F, FeatureValue *G, size_t, double) const { double result = 0.0; if ( G != F ){ result = lv_distance( F->Name(), G->Name() ); } return result; } double DiceMetric::distance( FeatureValue *F, FeatureValue *G, size_t, double ) const { double result = 0.0; if ( G != F ){ result = dc_distance( F->Name(), G->Name() ); } return result; } double ValueDiffMetric::distance( FeatureValue *F, FeatureValue *G, size_t limit, double ) const { double result = 0.0; if ( G != F ){ if ( F->ValFreq() < limit || G->ValFreq() < limit ){ #ifdef METRIC_DEBUG cerr << "result = 1.0 vanwege F.valFreq=" << F->ValFreq() << " en G.valFreq()=" << G ->ValFreq() << " met limiet= " << limit << endl; #endif result = 1.0; } else result = vd_distance( F->valueClassProb(), G->valueClassProb() ); } return result; } double NumericMetric::distance( FeatureValue *F, FeatureValue *G, size_t, double scale ) const { double r1, r2, result; if ( FV_to_real( F, r1 ) && FV_to_real( G, r2 ) ) result = fabs( (r1-r2)/ ( scale ) ); else result = 1.0; return result; } double EuclideanMetric::distance( FeatureValue *F, FeatureValue *G, size_t, double scale ) const { double r1, r2, result; if ( FV_to_real( F, r1 ) && FV_to_real( G, r2 ) ) result = sqrt(fabs(r1*r1-r2*r2))/ ( scale ); else result = 1.0; return result; } double DotProductMetric::distance( FeatureValue *, FeatureValue *, size_t, double ) const { throw( logic_error( "unimplemented distance() for Dotproduct metric!" ) ); return -1.0; } double CosineMetric::distance( FeatureValue *, FeatureValue *, size_t, double ) const { throw( logic_error( "unimplemented distance() for Cosine metric!" ) ); return -1.0; } }