/*
$Id: Metrics.cxx 15828 2013-03-28 11:55:53Z sloot $
$URL: https://ilk.uvt.nl/svn/trunk/sources/Timbl6/src/Metrics.cxx $
Copyright (c) 1998 - 2013
ILK - Tilburg University
CLiPS - University of Antwerp
This file is part of timbl
timbl is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
timbl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, see .
For questions and suggestions, see:
http://ilk.uvt.nl/software.html
or send mail to:
timbl@uvt.nl
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include "timbl/Common.h"
#include "timbl/MsgClass.h"
#include "timbl/Types.h"
#include "timbl/Instance.h"
#include "timbl/Metrics.h"
using namespace std;
using namespace TiCC;
using Common::Epsilon;
using Common::Log2;
//#define METRIC_DEBUG
namespace Timbl{
const double maxSimilarity = std::numeric_limits::max();
double lv_distance( const string& source, const string& target ){
// code taken from: http://www.merriampark.com/ldcpp.htm
// Levenshtein Distance Algorithm: C++ Implementation
// by Anders Sewerin Johansen
// Step 1
const size_t n = source.length();
const size_t m = target.length();
if (n == 0) {
return (double)m;
}
if (m == 0) {
return (double)n;
}
// Good form to declare a TYPEDEF
typedef std::vector< std::vector > Tmatrix;
Tmatrix matrix(n+1);
// Size the vectors in the 2.nd dimension. Unfortunately C++ doesn't
// allow for allocation on declaration of 2.nd dimension of vec of vec
for ( size_t i = 0; i <= n; ++i ) {
matrix[i].resize(m+1);
}
// Step 2
for ( size_t i = 0; i <= n; ++i ) {
matrix[i][0]=i;
}
for ( size_t i = 0; i <= m; ++i ) {
matrix[0][i]=i;
}
// Step 3
for ( size_t i = 1; i <= n; ++i ) {
const char s_i = source[i-1];
// Step 4
for ( size_t j = 1; j <= m; ++j ) {
const char t_j = target[j-1];
// Step 5
int cost;
if (s_i == t_j) {
cost = 0;
}
else {
cost = 1;
}
// Step 6
const size_t above = matrix[i-1][j];
const size_t left = matrix[i][j-1];
const size_t diag = matrix[i-1][j-1];
size_t cell = min( above + 1, min(left + 1, diag + cost));
// Step 6A: Cover transposition, in addition to deletion,
// insertion and substitution. This step is taken from:
// Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
// Enhanced Dynamic Programming ASM Algorithm"
// (http://www.acm.org/~hlb/publications/asm/asm.html)
if (i>2 && j>2) {
size_t trans=matrix[i-2][j-2]+1;
if (source[i-2]!=t_j) trans++;
if (s_i!=target[j-2]) trans++;
if (cell>trans) cell=trans;
}
matrix[i][j]=cell;
}
}
return (double)matrix[n][m];
}
double dc_distance( const string& string1, const string& string2 ){
// code taken from:
// http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Dice's_coefficient
unsigned int ls1 = string1.length();
unsigned int ls2 = string2.length();
double dice;
int overlap = 0;
int total = 0;
if ( ls1 <= 1 || ls2 <= 1 ){
// back-off naar unigrammen
set string1_unigrams;
set string2_unigrams;
for(unsigned int i = 0; i < ls1; i++) {
string1_unigrams.insert(string1[i]);
}
for(unsigned int i = 0; i < ls2; i++) {
string2_unigrams.insert(string2[i]);
}
set::const_iterator it = string2_unigrams.begin();
while ( it != string2_unigrams.end() ){
if ( string1_unigrams.find( *it ) != string1_unigrams.end() )
++overlap;
++it;
}
total = string1_unigrams.size() + string2_unigrams.size();
}
else {
set string1_bigrams;
set string2_bigrams;
for(unsigned int i = 0; i < (ls1 - 1); i++) { // extract character bigrams from string1
string1_bigrams.insert(string1.substr(i, 2));
}
for(unsigned int i = 0; i < (ls2 - 1); i++) { // extract character bigrams from string2
string2_bigrams.insert(string2.substr(i, 2));
}
set::const_iterator it = string2_bigrams.begin();
while ( it != string2_bigrams.end() ){
if ( string1_bigrams.find( *it ) != string1_bigrams.end() )
++overlap;
++ it;
}
total = string1_bigrams.size() + string2_bigrams.size();
}
dice = (double)(overlap * 2) / (double)total;
// we will return 1 - dice coefficient ad distance
dice = 1.0 - dice;
return dice;
}
double vd_distance( SparseValueProbClass *r, SparseValueProbClass *s ){
double result = 0.0;
if ( ! ( r && s ) )
return 1.0;
SparseValueProbClass::IDiterator p1 = r->begin();
SparseValueProbClass::IDiterator p2 = s->begin();
while( p1 != r->end() &&
p2 != s->end() ){
if ( p2->first < p1->first ){
result += p2->second;
++p2;
}
else if ( p2->first == p1->first ){
result += fabs( p1->second - p2->second );
++p1;
++p2;
}
else {
result += p1->second;
++p1;
}
}
while ( p1 != r->end() ){
result += p1->second;
++p1;
}
while ( p2 != s->end() ){
result += p2->second;
++p2;
}
result = result / 2.0;
return result;
}
double p_log_p_div_q( double p, double q ) {
if ( abs(q) < Epsilon )
return 0;
return p * Log2( p/q );
}
double jd_distance( SparseValueProbClass *r, SparseValueProbClass *s ){
double part1 = 0.0;
double part2 = 0.0;
SparseValueProbClass::IDiterator p1 = r->begin();
SparseValueProbClass::IDiterator p2 = s->begin();
while( p1 != r->end() &&
p2 != s->end() ){
if ( p2->first < p1->first ){
part2 += p2->second;
++p2;
}
else if ( p2->first == p1->first ){
part1 += p_log_p_div_q( p1->second, p2->second );
part2 += p_log_p_div_q( p2->second, p1->second );
++p1;
++p2;
}
else {
part1 += p1->second;
++p1;
}
}
while ( p1 != r->end() ){
part1 += p1->second;
++p1;
}
while ( p2 != s->end() ){
part2 += p2->second;
++p2;
}
double result = part1 + part2;
result = result / 2.0;
return result;
}
double k_log_k_div_m( double k, double l ) {
if ( abs(k+l) < Epsilon )
return 0;
return k * Log2( (2.0 * k)/( k + l ) );
}
double js_distance( SparseValueProbClass *r, SparseValueProbClass *s ){
double part1 = 0.0;
double part2 = 0.0;
SparseValueProbClass::IDiterator p1 = r->begin();
SparseValueProbClass::IDiterator p2 = s->begin();
while( p1 != r->end() &&
p2 != s->end() ){
if ( p2->first < p1->first ){
part2 += p2->second;
++p2;
}
else if ( p2->first == p1->first ){
part1 += k_log_k_div_m( p1->second, p2->second );
part2 += k_log_k_div_m( p2->second, p1->second );
++p1;
++p2;
}
else {
part1 += p1->second;
++p1;
}
}
while ( p1 != r->end() ){
part1 += p1->second;
++p1;
}
while ( p2 != s->end() ){
part2 += p2->second;
++p2;
}
double result = part1 + part2;
result = result / 2.0;
return result;
}
metricClass *getMetricClass( MetricType mt ){
switch ( mt ){
case Overlap:
return new OverlapMetric();
break;
case Numeric:
return new NumericMetric();
break;
case Euclidean:
return new EuclideanMetric();
break;
case Cosine:
return new CosineMetric();
break;
case DotProduct:
return new DotProductMetric();
break;
case ValueDiff:
return new ValueDiffMetric();
break;
case JeffreyDiv:
return new JeffreyMetric();
break;
case JSDiv:
return new JSMetric();
break;
case Levenshtein:
return new LevenshteinMetric();
break;
case Dice:
return new DiceMetric();
break;
case Ignore:
return 0;
break;
default:
throw logic_error("getMetricClass: unknown MetricType " + toString(mt) );
}
}
double OverlapMetric::distance( FeatureValue *F,
FeatureValue *G,
size_t,
double ) const {
if ( F == G )
return 0.0;
else
return 1.0;
}
inline bool FV_to_real( FeatureValue *FV, double &result ){
if ( FV ){
if ( stringTo( FV->Name(), result ) )
return true;
}
return false;
}
double JeffreyMetric::distance( FeatureValue *F, FeatureValue *G,
size_t limit,
double ) const {
double result = 0.0;
if ( G != F ){
if ( F->ValFreq() < limit ||
G->ValFreq() < limit ){
#ifdef METRIC_DEBUG
cerr << "result = 1.0 vanwege F.valFreq=" << F->ValFreq()
<< " en G.valFreq()=" << G ->ValFreq()
<< " met limiet= " << limit << endl;
#endif
result = 1.0;
}
else {
result = jd_distance( F->valueClassProb(), G->valueClassProb() );
}
}
return result;
}
double JSMetric::distance( FeatureValue *F, FeatureValue *G,
size_t limit,
double ) const {
double result = 0.0;
if ( G != F ){
if ( F->ValFreq() < limit ||
G->ValFreq() < limit ){
#ifdef METRIC_DEBUG
cerr << "result = 1.0 vanwege F.valFreq=" << F->ValFreq()
<< " en G.valFreq()=" << G ->ValFreq()
<< " met limiet= " << limit << endl;
#endif
result = 1.0;
}
else {
result = js_distance( F->valueClassProb(), G->valueClassProb() );
}
}
return result;
}
double LevenshteinMetric::distance( FeatureValue *F, FeatureValue *G,
size_t, double) const {
double result = 0.0;
if ( G != F ){
result = lv_distance( F->Name(), G->Name() );
}
return result;
}
double DiceMetric::distance( FeatureValue *F, FeatureValue *G,
size_t, double ) const {
double result = 0.0;
if ( G != F ){
result = dc_distance( F->Name(), G->Name() );
}
return result;
}
double ValueDiffMetric::distance( FeatureValue *F, FeatureValue *G,
size_t limit,
double ) const {
double result = 0.0;
if ( G != F ){
if ( F->ValFreq() < limit ||
G->ValFreq() < limit ){
#ifdef METRIC_DEBUG
cerr << "result = 1.0 vanwege F.valFreq=" << F->ValFreq()
<< " en G.valFreq()=" << G ->ValFreq()
<< " met limiet= " << limit << endl;
#endif
result = 1.0;
}
else
result = vd_distance( F->valueClassProb(), G->valueClassProb() );
}
return result;
}
double NumericMetric::distance( FeatureValue *F, FeatureValue *G,
size_t,
double scale ) const {
double r1, r2, result;
if ( FV_to_real( F, r1 ) &&
FV_to_real( G, r2 ) )
result = fabs( (r1-r2)/ ( scale ) );
else
result = 1.0;
return result;
}
double EuclideanMetric::distance( FeatureValue *F, FeatureValue *G,
size_t,
double scale ) const {
double r1, r2, result;
if ( FV_to_real( F, r1 ) &&
FV_to_real( G, r2 ) )
result = sqrt(fabs(r1*r1-r2*r2))/ ( scale );
else
result = 1.0;
return result;
}
double DotProductMetric::distance( FeatureValue *, FeatureValue *,
size_t, double ) const {
throw( logic_error( "unimplemented distance() for Dotproduct metric!" ) );
return -1.0;
}
double CosineMetric::distance( FeatureValue *, FeatureValue *,
size_t, double ) const {
throw( logic_error( "unimplemented distance() for Cosine metric!" ) );
return -1.0;
}
}