/*
  $Id: Instance.cxx 15828 2013-03-28 11:55:53Z sloot $
  $URL: https://ilk.uvt.nl/svn/trunk/sources/Timbl6/src/Instance.cxx $

  Copyright (c) 1998 - 2013
  ILK   - Tilburg University
  CLiPS - University of Antwerp

  This file is part of timbl

  timbl is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 3 of the License, or
  (at your option) any later version.

  timbl is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, see <http://www.gnu.org/licenses/>.

  For questions and suggestions, see:
      http://ilk.uvt.nl/software.html
  or send mail to:
      timbl@uvt.nl
*/

#include <string>
#include <vector>
#include <set>
#include <map>
#include <algorithm>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <cmath>
#include <cstdlib>
#include <cstdio>
#include <cctype>
#include <stdexcept>

#include "ticcutils/StringOps.h"
#include "ticcutils/TreeHash.h"
#include "timbl/Common.h"
#include "timbl/MsgClass.h"
#include "timbl/Types.h"
#include "timbl/Instance.h"
#include "timbl/Metrics.h"

using namespace std;
using namespace TiCC;

namespace Timbl {
  using namespace Common;

  size_t Vfield::Index() { return value->Index(); }

  ostream& operator<<(ostream& os, const Vfield *vd ) {
    return vd->put( os );
  }

  ostream& operator<<(ostream& os, const Vfield& vd ) {
    return vd.put( os );
  }

  ostream& Vfield::put( ostream& os ) const {
    os << value << " " << weight;
    return os;
  }

  inline int random_number( int Min, int Max ){
    // calculate a random integer within the interval [min,max]
    if ( Min == Max )
      return Min;
    double randnum = (double)rand()/(double)RAND_MAX;
    randnum *= (Max-Min);
    randnum += Min;
    return (int)floor(randnum+0.5);
  }

  void ValueDistribution::clear(){
    VDlist::iterator it;
    for ( it = distribution.begin(); it != distribution.end(); ++it )
      delete it->second;
    distribution.clear();
    total_items = 0;
  }

  double ValueDistribution::Confidence( const TargetValue *tv ) const {
    VDlist::const_iterator it = distribution.begin();
    while ( it != distribution.end() ){
      if ( it->second->Value() == tv )
        return it->second->Weight();
      ++it;
    }
    return 0.0;
  }

  void ValueDistribution::DistToString( string& DistStr, double minf ) const {
    ostringstream oss;
    VDlist::const_iterator it = distribution.begin();
    oss.setf(ios::showpoint);
    bool first = true;
    oss << "{ ";
    while ( it != distribution.end() ){
      Vfield *f = it->second;
      if ( f->frequency >= minf ){
        if ( !first )
          oss << ", ";
        oss << f->value << " " << double(f->frequency);
        first = false;
      }
      ++it;
    }
    oss << " }";
    DistStr = oss.str();
  }

  void WValueDistribution::DistToString( string& DistStr, double minw ) const {
    ostringstream oss;
    VDlist::const_iterator it = distribution.begin();
    oss.setf(ios::showpoint);
    bool first = true;
    oss << "{ ";
    while ( it != distribution.end() ){
      Vfield *f = it->second;
      if ( abs(f->weight) < minw ){
        ++it;
        continue;
      }
      if ( abs(f->weight) < Epsilon ){
        ++it;
        continue;
      }
      if ( !first )
        oss << ", ";
      oss << f->value << " " << f->weight;
      first = false;
      ++it;
    }
    oss << " }";
    DistStr = oss.str();
  }

  class dblCmp {
  public:
    bool operator() ( const double d1, const double d2 ) const {
      return d1 - d2 > Epsilon;
    }
  };
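  // The DistToStringWW() variants below print only the 'beam' strongest
  // classes of a distribution: the frequencies/weights are collected in a
  // dblCmp-ordered (descending) set, the beam-th value is taken as a
  // cut-off, and DistToString() is then called with that cut-off.
  // Illustrative example (not from the original sources): for a
  // distribution { A 10, B 4, C 1 } and beam == 2 the cut-off becomes 4,
  // so only A and B are printed, roughly as "{ A 10.0000, B 4.00000 }"
  // (the exact number formatting depends on the stream flags).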
  void ValueDistribution::DistToStringWW( string& DistStr, int beam ) const {
    double minw = 0.0;
    if ( beam > 0 ){
      std::set<double, dblCmp> freqs;
      VDlist::const_iterator it = distribution.begin();
      while ( it != distribution.end() ){
        Vfield *f = it->second;
        freqs.insert( f->frequency );
        ++it;
      }
      int cnt = 0;
      std::set<double, dblCmp>::iterator rit = freqs.begin();
      while ( rit != freqs.end() && cnt < beam ) {
        ++cnt;
        if ( cnt < beam )
          ++rit;
      }
      if ( cnt == beam )
        minw = *rit;
    }
    DistToString( DistStr, minw );
  }

  void WValueDistribution::DistToStringWW( string& DistStr, int beam ) const {
    double minw = 0.0;
    if ( beam > 0 ){
      std::set<double, dblCmp> wgths;
      VDlist::const_iterator it = distribution.begin();
      while ( it != distribution.end() ){
        Vfield *f = it->second;
        wgths.insert( f->weight );
        ++it;
      }
      int cnt = 0;
      std::set<double, dblCmp>::iterator rit = wgths.begin();
      while ( rit != wgths.end() && cnt < beam ) {
        ++cnt;
        if ( cnt < beam )
          ++rit;
      }
      if ( cnt == beam )
        minw = *rit;
    }
    DistToString( DistStr, minw );
  }

  const string ValueDistribution::DistToString() const {
    string result;
    DistToString( result );
    return result;
  }

  const string ValueDistribution::DistToStringW( int beam ) const {
    string result;
    DistToStringWW( result, beam );
    return result;
  }

  double ValueDistribution::Entropy() const {
    double Prob = 0.0;
    double entropy = 0.0;
    size_t TotalVals = total_items;
    if ( TotalVals > 0 ){
      VDlist::const_iterator it = distribution.begin();
      // Loop over the classes in the distribution
      while ( it != distribution.end() ){
        size_t Freq = it->second->Freq();
        if ( Freq > 0 ){
          Prob = Freq / (double)TotalVals;
          entropy += Prob * Log2(Prob);
        }
        ++it;
      }
    }
    return fabs(entropy);
  }

  void WValueDistribution::Normalize() {
    double sum = 0.0;
    VDlist::iterator it = distribution.begin();
    while ( it != distribution.end() ){
      sum += it->second->Weight();
      ++it;
    }
    it = distribution.begin();
    while ( it != distribution.end() ){
      it->second->SetWeight( it->second->Weight() / sum );
      ++it;
    }
  }

  void WValueDistribution::Normalize_1( double factor, const Target *targ ) {
    for ( unsigned int i=0; i < targ->ValuesArray.size(); ++i ){
      // search for val, if not there: add entry with frequency factor;
      // otherwise increment the ExemplarWeight
      TargetValue *val = (TargetValue*)targ->ValuesArray[i];
      size_t id = val->Index();
      VDlist::const_iterator it = distribution.find( id );
      if ( it != distribution.end() ){
        it->second->SetWeight( it->second->Weight() + factor );
      }
      else {
        distribution[id] = new Vfield( val, 1, factor );
      }
    }
    total_items += targ->ValuesArray.size();
    Normalize();
  }

  void WValueDistribution::Normalize_2( ) {
    VDlist::iterator it = distribution.begin();
    while ( it != distribution.end() ){
      it->second->SetWeight( log( it->second->Weight() + 1 ));
      ++it;
    }
    Normalize();
  }

  ValueDistribution *ValueDistribution::to_VD_Copy( ) const {
    ValueDistribution *res = new ValueDistribution();
    size_t key;
    Vfield *vdf;
    VDlist::const_iterator It = distribution.begin();
    while ( It != distribution.end() ){
      key = It->first;
      vdf = It->second;
      res->distribution[key] = new Vfield( vdf->Value(), vdf->Freq(),
                                           vdf->Freq() );
      ++It;
    }
    res->total_items = total_items;
    return res;
  }

  WValueDistribution *ValueDistribution::to_WVD_Copy() const {
    WValueDistribution *res = new WValueDistribution();
    size_t key;
    Vfield *vdf;
    VDlist::const_iterator It = distribution.begin();
    while ( It != distribution.end() ){
      key = It->first;
      vdf = It->second;
      res->distribution[key] = new Vfield( vdf->Value(), vdf->Freq(),
                                           vdf->Freq() );
      ++It;
    }
    res->total_items = total_items;
    return res;
  }
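  // Note on the copy variants: to_WVD_Copy() on a plain ValueDistribution
  // seeds every weight with the stored frequency, whereas the
  // WValueDistribution override below keeps the real weights intact.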
  WValueDistribution *WValueDistribution::to_WVD_Copy( ) const {
    WValueDistribution *result = new WValueDistribution();
    VDlist::const_iterator it = distribution.begin();
    Vfield *vdf;
    size_t key;
    while ( it != distribution.end() ){
      key = it->first;
      vdf = it->second;
      result->distribution[key] = new Vfield( vdf->Value(), vdf->Freq(),
                                              vdf->Weight() );
      ++it;
    }
    result->total_items = total_items;
    return result;
  }

  //
  // special functions to serialize distributions including both frequency
  // AND weight information. Needed for store/retrieve InstanceBases
  //
  // First hashed variant:
  //

  const string ValueDistribution::SaveHashed() const{
    ostringstream oss;
    VDlist::const_iterator it = distribution.begin();
    oss << "{ ";
    bool first = true;
    while ( oss.good() && it != distribution.end() ){
      Vfield *f = it->second;
      if ( f->frequency > 0 ){
        if ( !first )
          oss << ", ";
        oss << f->value->Index() << " " << f->frequency;
        first = false;
      }
      ++it;
    }
    oss << " }";
    return oss.str();
  }

  const string WValueDistribution::SaveHashed() const{
    ostringstream oss;
    VDlist::const_iterator it = distribution.begin();
    bool first = true;
    oss << "{ ";
    while ( oss.good() && it != distribution.end() ){
      Vfield *f = it->second;
      if ( f->frequency > 0 ){
        if ( !first )
          oss << ", ";
        oss << f->Value()->Index() << " "
            << f->frequency << " " << f->weight;
        first = false;
      }
      ++it;
    }
    oss << " }";
    return oss.str();
  }

  //
  // non-hashed variant:
  //

  const string ValueDistribution::Save() const{
    ostringstream oss;
    VDlist::const_iterator it = distribution.begin();
    oss << "{ ";
    bool first = true;
    while ( oss.good() && it != distribution.end() ){
      Vfield *f = it->second;
      if ( f->frequency > 0 ){
        if ( !first )
          oss << ", ";
        oss << f->value << " " << f->frequency;
        first = false;
      }
      ++it;
    }
    oss << " }";
    return oss.str();
  }

  const string WValueDistribution::Save() const{
    ostringstream oss;
    VDlist::const_iterator it = distribution.begin();
    oss << "{ ";
    bool first = true;
    while ( oss.good() && it != distribution.end() ){
      Vfield *f = it->second;
      if ( f->frequency > 0 ){
        if ( !first )
          oss << ", ";
        oss.setf(ios::showpoint);
        oss << f->value << " " << f->frequency << " " << f->weight;
        first = false;
      }
      ++it;
    }
    oss << " }";
    return oss.str();
  }
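  // Illustrative examples of the serialized formats produced above (class
  // names and numbers are made up): Save() writes "{ NP 4, VP 2 }", the
  // weighted Save() adds the weight as a third column per entry, e.g.
  // "{ NP 4 0.25, VP 2 1.5 }", and the SaveHashed() variants write each
  // value's hash index instead of its name.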
  void ValueDistribution::SetFreq( const TargetValue *val, const int freq,
                                   double ){
    // add entry with frequency freq;
    Vfield *temp = new Vfield( val, freq, freq );
    distribution[val->Index()] = temp;
    total_items += freq;
  }

  void WValueDistribution::SetFreq( const TargetValue *val, const int freq,
                                    double sw ){
    // add entry with frequency freq;
    // also sets the sample_weight
    Vfield *temp = new Vfield( val, freq, sw );
    distribution[val->Index()] = temp;
    total_items += freq;
  }

  bool ValueDistribution::IncFreq( const TargetValue *val, size_t occ,
                                   double ){
    // search for val, if not there: add entry with frequency 'occ';
    // otherwise increment the frequency
    size_t id = val->Index();
    VDlist::const_iterator it = distribution.find( id );
    if ( it != distribution.end() ){
      it->second->IncFreq( occ );
    }
    else
      distribution[id] = new Vfield( val, occ, 1.0 );
    total_items += occ;
    return true;
  }

  bool WValueDistribution::IncFreq( const TargetValue *val, size_t occ,
                                    double sw ){
    // search for val, if not there: add entry with frequency 'occ';
    // otherwise increment the frequency
    // also set sample weight
    size_t id = val->Index();
    VDlist::const_iterator it = distribution.find( id );
    if ( it != distribution.end() ){
      it->second->IncFreq( occ );
    }
    else {
      distribution[id] = new Vfield( val, occ, sw );
    }
    total_items += occ;
    return fabs( distribution[id]->Weight() - sw ) > Epsilon;
  }

  void ValueDistribution::DecFreq( const TargetValue *val ){
    // search for val, if not there, just forget
    // otherwise decrement the frequency
    VDlist::const_iterator it = distribution.find( val->Index() );
    if ( it != distribution.end() ){
      it->second->DecFreq();
      total_items -= 1;
    }
  }

  void ValueDistribution::Merge( const ValueDistribution& VD ){
    VDlist::const_iterator It = VD.distribution.begin();
    size_t key;
    Vfield *vd;
    while ( It != VD.distribution.end() ){
      key = It->first;
      vd = It->second;
      if ( distribution.find(key) != distribution.end() ){
        distribution[key]->AddFreq( vd->Freq() );
      }
      else
        // VD might be weighted. But we don't need/want that info here
        // Weight == Freq is more convenient
        distribution[key] = new Vfield( vd->Value(), vd->Freq(), vd->Freq() );
      ++It;
    }
    total_items += VD.total_items;
  }

  void WValueDistribution::MergeW( const ValueDistribution& VD,
                                   double Weight ){
    VDlist::const_iterator It = VD.distribution.begin();
    while ( It != VD.distribution.end() ){
      Vfield *vd = It->second;
      size_t key = It->first;
      if ( distribution.find(key) != distribution.end() ){
        distribution[key]->SetWeight( distribution[key]->Weight() +
                                      vd->Weight() * Weight );
      }
      else {
        distribution[key] = new Vfield( vd->Value(), 1,
                                        vd->Weight() * Weight );
      }
      ++It;
    }
    total_items += VD.total_items;
  }

  const TargetValue *ValueDistribution::BestTarget( bool& tie,
                                                    bool do_rand ) const {
    // get the most frequent target from the distribution.
    // In case of a tie take the one which is GLOBALLY the most frequent,
    // OR (if do_rand) take a random one of the most frequent values
    // and signal if this ties also!
    const TargetValue *best = NULL;
    tie = false;
    VDlist::const_iterator It = distribution.begin();
    if ( It != distribution.end() ){
      Vfield *pnt = It->second;
      size_t Max = pnt->Freq();
      if ( do_rand ){
        int nof_best=1, pick=1;
        ++It;
        while ( It != distribution.end() ){
          pnt = It->second;
          if ( pnt->Freq() > Max ){
            Max = pnt->Freq();
            nof_best = 1;
          }
          else if ( pnt->Freq() == Max )
            nof_best++;
          ++It;
        }
        tie = ( nof_best > 1 );
        pick = random_number( 1, nof_best );
        It = distribution.begin();
        nof_best = 0;
        while ( It != distribution.end() ){
          pnt = It->second;
          if ( pnt->Freq() == Max )
            if ( ++nof_best == pick ){
              return pnt->Value();
            }
          ++It;
        }
        return NULL;
      }
      else {
        best = pnt->Value();
        ++It;
        while ( It != distribution.end() ){
          pnt = It->second;
          if ( pnt->Freq() > Max ){
            tie = false;
            best = pnt->Value();
            Max = pnt->Freq();
          }
          else if ( pnt->Freq() == Max ) {
            tie = true;
            if ( pnt->Value()->ValFreq() > best->ValFreq() ){
              best = pnt->Value();
            }
          }
          ++It;
        }
        return best;
      }
    }
    return best;
  }
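  // Tie handling in the BestTarget() variants, illustrated with made-up
  // counts: for a distribution { A 5, B 5, C 2 } the tie flag is set and
  // either the globally more frequent of A and B is returned (default),
  // or, with do_rand, a randomly picked one of the two.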
  const TargetValue *WValueDistribution::BestTarget( bool& tie,
                                                     bool do_rand ) const {
    // get the most frequent target from the distribution.
    // In case of a tie take the one which is GLOBALLY the most frequent,
    // OR (if do_rand) take a random one of the most frequent values
    // and signal if this ties also!
    const TargetValue *best = NULL;
    VDlist::const_iterator It = distribution.begin();
    tie = false;
    if ( It != distribution.end() ){
      double Max = It->second->Weight();
      if ( do_rand ){
        int nof_best=1, pick=1;
        ++It;
        while ( It != distribution.end() ){
          if ( It->second->Weight() > Max ){
            Max = It->second->Weight();
            nof_best = 1;
          }
          else if ( abs(It->second->Weight()- Max) < Epsilon )
            nof_best++;
          ++It;
        }
        tie = ( nof_best > 1 );
        pick = random_number( 1, nof_best );
        It = distribution.begin();
        nof_best = 0;
        while ( It != distribution.end() ){
          if ( abs(It->second->Weight() - Max) < Epsilon )
            if ( ++nof_best == pick ){
              return It->second->Value();
            }
          ++It;
        }
        return NULL;
      }
      else {
        best = It->second->Value();
        ++It;
        while ( It != distribution.end() ){
          if ( It->second->Weight() > Max ){
            tie = false;
            best = It->second->Value();
            Max = It->second->Weight();
          }
          else if ( abs(It->second->Weight() - Max) < Epsilon ) {
            tie = true;
            if ( It->second->Value()->ValFreq() > best->ValFreq() ){
              best = It->second->Value();
            }
          }
          ++It;
        }
        return best;
      }
    }
    return best;
  }

  Feature::Feature( Hash::StringHash *T ):
    BaseFeatTargClass(T),
    metric_matrix( 0 ),
    metric( 0 ),
    ignore( false ),
    numeric( false ),
    vcpb_read( false ),
    PrestoreStatus(ps_undef),
    Prestored_metric( UnknownMetric ),
    entropy( 0.0 ),
    info_gain (0.0),
    split_info(0.0),
    gain_ratio(0.0),
    chi_square(0.0),
    shared_variance(0.0),
    standard_deviation(0.0),
    matrix_clip_freq(10),
    n_dot_j( 0 ),
    n_i_dot( 0 ),
    n_min (0.0),
    n_max (0.0),
    SaveSize(0),
    SaveNum(0),
    weight(0.0)
  {}

  Feature::Feature( const Feature& in ): BaseFeatTargClass( in ){
    *this = in;
  }

  Feature& Feature::operator=( const Feature& in ){
    if ( this != &in ){
      metric_matrix = in.metric_matrix;
      metric = in.metric;
      PrestoreStatus = in.PrestoreStatus;
      Prestored_metric = in.Prestored_metric;
      ignore = in.ignore;
      numeric = in.numeric;
      vcpb_read = in.vcpb_read;
      entropy = in.entropy;
      info_gain = in.info_gain;
      split_info = in.split_info;
      gain_ratio = in.gain_ratio;
      chi_square = in.chi_square;
      shared_variance = in.shared_variance;
      standard_deviation = in.standard_deviation;
      matrix_clip_freq = in.matrix_clip_freq;
      n_dot_j = in.n_dot_j;
      n_i_dot = in.n_i_dot;
      n_min = in.n_min;
      n_max = in.n_max;
      SaveSize = in.SaveSize;
      SaveNum = in.SaveNum;
      weight = in.weight;
    }
    return *this;
  }
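  // Note: the Feature copy constructor delegates to BaseFeatTargClass( in ),
  // which marks the copy with is_copy. Copies therefore never delete the
  // shared ValuesArray, metric or count arrays in the destructors, and
  // InitSparseArrays() below is a no-op for them.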
  void Feature::InitSparseArrays(){
    if ( !is_copy ){
      // Loop over all values.
      //
      VCarrtype::const_iterator it = ValuesArray.begin();
      while ( it != ValuesArray.end() ){
        FeatureValue *FV = (FeatureValue*)*it;
        size_t freq = FV->ValFreq();
        FV->ValueClassProb->Clear();
        if ( freq > 0 ){
          // Loop over all present classes.
          //
          ValueDistribution::dist_iterator It = FV->TargetDist.begin();
          while ( It != FV->TargetDist.end() ){
            FV->ValueClassProb->Assign( It->second->Index(),
                                        It->second->Freq()/(double)freq );
            ++It;
          }
        }
        ++it;
      }
    }
  }

  struct D_D {
    D_D(){ dist = NULL; value = 0.0; };
    D_D( FeatureValue *fv ){
      if ( !stringTo( fv->Name(), value ) )
        throw( logic_error("called DD with an non-numeric value" ) );
      dist = &fv->TargetDist;
    }
    ValueDistribution *dist;
    double value;
  };

  bool dd_less( const D_D* dd1, const D_D* dd2 ){
    return dd1->value < dd2->value;
  }

  void Feature::NumStatistics( vector<FeatureValue *>& FVBin,
                               double DBentropy,
                               int BinSize ){
    double Prob, FVEntropy;
    size_t TotalVals = TotalValues();
    entropy = 0.0;
    vector<D_D*> ddv;
    VCarrtype::const_iterator it = ValuesArray.begin();
    ddv.reserve( ValuesArray.size() );
    while ( it != ValuesArray.end() ){
      FeatureValue *FV = (FeatureValue*)*it;
      if ( FV->ValFreq() > 0 ){
        ddv.push_back( new D_D( FV ) );
      }
      ++it;
    }
    sort( ddv.begin(), ddv.end(), dd_less );
    size_t dd_len = ddv.size();
    int num_per_bin = (int)floor( (double)dd_len / BinSize);
    size_t rest = dd_len - num_per_bin * BinSize;
    if ( rest )
      num_per_bin++;
    int jj = 0;
    int cnt = 0;
    for ( size_t m = 0; m < dd_len; ++m ){
      FVBin[jj]->TargetDist.Merge( *ddv[m]->dist );
      if ( ++cnt >= num_per_bin ){
        ++jj;
        if ( --rest == 0 )
          --num_per_bin;
        cnt = 0;
      }
    }
    for ( size_t j=0; j < dd_len; ++j ){
      delete ddv[j];
    }
    for ( int k=0; k < BinSize; k++ ){
      FeatureValue *pnt = FVBin[k];
      size_t Freq = pnt->TargetDist.totalSize();
      pnt->ValFreq( Freq );
      if ( Freq > 0 ){
        // Entropy for this FV pair.
        //
        FVEntropy = 0.0;
        ValueDistribution::dist_iterator It = pnt->TargetDist.begin();
        while ( It != pnt->TargetDist.end() ){
          Prob = It->second->Freq()/(double)Freq;
          FVEntropy += Prob * Log2(Prob);
          ++It;
        }
        entropy += -FVEntropy * Freq / (double)TotalVals;
      }
    }
    entropy = fabs( entropy );
    // Info gain.
    //
    info_gain = DBentropy - entropy;

    // And the split info.
    //
    split_info = 0.0;
    for ( int l=0; l < BinSize; ++l ){
      size_t Freq = FVBin[l]->ValFreq();
      if ( Freq > 0 ){
        Prob = Freq / (double)TotalVals;
        split_info += Prob * Log2(Prob);
      }
    }
    split_info = -split_info;
    // Gain ratio.
    //
    if ( fabs(split_info) < Epsilon )
      gain_ratio = 0.0;
    else
      gain_ratio = info_gain / split_info;
  }

  void Feature::NumStatistics( double DBentropy, Target *Targets,
                               int BinSize, bool full ){
    char dumname[80];
    vector<FeatureValue *> FVBin(BinSize);
    for ( int i=0; i < BinSize; ++i ){
      sprintf( dumname, "dum%d", i );
      FVBin[i] = new FeatureValue( dumname );
    }
    NumStatistics( FVBin, DBentropy, BinSize );
    if ( full ){
      ChiSquareStatistics( FVBin, BinSize, Targets );
      int cnt = 0;   // count effective values in Bin
      for( int i=0; i < BinSize; ++i ){
        if ( FVBin[i]->ValFreq() > 0 )
          cnt++;
      }
      SharedVarianceStatistics( Targets, cnt );
    }
    for( int i=0; i < BinSize; ++i ){
      delete FVBin[i];
    }
  }
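  // The per-feature statistics computed by NumStatistics() above and
  // Statistics() below follow the usual decision-tree definitions:
  //   entropy     H(f)  = -sum_v P(v) * sum_c P(c|v) log2 P(c|v)
  //   info gain   IG(f) = H(DB) - H(f)
  //   split info  SI(f) = -sum_v P(v) log2 P(v)
  //   gain ratio  GR(f) = IG(f) / SI(f)
  // where v ranges over the (binned) feature values and c over the classes.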
  void Feature::Statistics( double DBentropy ){
    double Prob = 0.0;
    size_t TotalVals = TotalValues();
    entropy = 0.0;
    // Loop over the values.
    VCarrtype::const_iterator it = ValuesArray.begin();
    while ( it != ValuesArray.end() ){
      FeatureValue *fv = (FeatureValue*)*it;
      // Entropy for this FV pair.
      double FVEntropy = 0.0;
      size_t Freq = fv->ValFreq();
      if ( Freq > 0 ){
        ValueDistribution::dist_iterator It = fv->TargetDist.begin();
        while ( It != fv->TargetDist.end() ){
          Prob = It->second->Freq() / (double)Freq;
          FVEntropy += Prob * Log2(Prob);
          ++It;
        }
        entropy += -FVEntropy * Freq / (double)TotalVals;
      }
      ++it;
    }
    entropy = fabs( entropy );
    // Info gain.
    //
    info_gain = DBentropy - entropy;
    if ( info_gain < 0.0 )
      info_gain = 0.0;
    // And the split info.
    //
    split_info = 0.0;
    it = ValuesArray.begin();
    while( it != ValuesArray.end() ){
      FeatureValue *fv = (FeatureValue*)*it;
      Prob = fv->ValFreq() / (double)TotalVals;
      if ( Prob > 0 ) {
        split_info += Prob * Log2(Prob);
      }
      ++it;
    }
    split_info = -split_info;
    // Gain ratio.
    //
    if ( fabs(split_info) < Epsilon )
      gain_ratio = 0.0;
    else
      gain_ratio = info_gain / split_info;
  }

  void Feature::ChiSquareStatistics( vector<FeatureValue *>& FVA,
                                     size_t Num_Vals,
                                     Target *Targets ){
    chi_square = 0.0;
    long int n_dot_dot = 0;
    double tmp;
    size_t Size = Targets->ValuesArray.size();
    if ( !n_dot_j ) {
      n_dot_j = new long int[Size];
      n_i_dot = new long int[Num_Vals];
      SaveSize = Size;
      SaveNum = Num_Vals;
    }
    else {
      if ( SaveSize < Size ){
        delete [] n_dot_j;
        n_dot_j = new long int[Size];
        SaveSize = Size;
      }
      if ( SaveNum < Num_Vals ){
        delete [] n_i_dot;
        n_i_dot = new long int[Num_Vals];
        SaveNum = Num_Vals;
      }
    }
    for ( size_t j = 0; j < Size; ++j ){
      n_dot_j[j] = 0;
    }
    ValueDistribution::dist_iterator It;
    for ( size_t i = 0; i < Num_Vals; ++i ){
      n_i_dot[i] = 0;
      FeatureValue *fv = FVA[i];
      It = fv->TargetDist.begin();
      while ( It != fv->TargetDist.end() ){
        n_dot_j[It->second->Index()-1] += It->second->Freq();
        n_i_dot[i] += It->second->Freq();
        ++It;
      }
      n_dot_dot += n_i_dot[i];
    }
    if ( n_dot_dot != 0 ){
      for ( size_t m = 0; m < Num_Vals; ++m ){
        FeatureValue *fv = FVA[m];
        It = fv->TargetDist.begin();
        size_t n = 0;
        while ( It != fv->TargetDist.end() && n < Size ){
          while ( n < It->second->Index()-1 ){
            tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) /
              (double)n_dot_dot;
            chi_square += tmp;
          }
          if ( n == It->second->Index()-1 ){
            tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) /
              (double)n_dot_dot;
            if ( fabs(tmp) > Epsilon){
              chi_square += ( (tmp - It->second->Freq()) *
                              (tmp - It->second->Freq()) ) / tmp;
            }
            ++It;
          }
          else
            break;
        }
        while ( n < Size ){
          tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) /
            (double)n_dot_dot;
          chi_square += tmp;
        }
      }
    }
  }

  void Feature::ChiSquareStatistics( Target *Targets ){
    chi_square = 0.0;
    long int n_dot_dot = 0;
    double tmp;
    size_t Size = Targets->ValuesArray.size();
    size_t Num_Vals = ValuesArray.size();
    if ( !n_dot_j ) {
      n_dot_j = new long int[Size];
      n_i_dot = new long int[Num_Vals];
      SaveSize = Size;
      SaveNum = Num_Vals;
    }
    else {
      if ( SaveSize < Size ){
        delete [] n_dot_j;
        n_dot_j = new long int[Size];
        SaveSize = Size;
      }
      if ( SaveNum < Num_Vals ){
        delete [] n_i_dot;
        n_i_dot = new long int[Num_Vals];
        SaveNum = Num_Vals;
      }
    }
    for ( size_t j = 0; j < Size; ++j ){
      n_dot_j[j] = 0;
    }
    ValueDistribution::dist_iterator It;
    VCarrtype::const_iterator it = ValuesArray.begin();
    int i = 0;
    while ( it != ValuesArray.end() ){
      n_i_dot[i] = 0;
      FeatureValue *fv = (FeatureValue *)*it;
      It = fv->TargetDist.begin();
      while ( It != fv->TargetDist.end() ){
        long int fr = It->second->Freq();
        n_dot_j[It->second->Index()-1] += fr;
        n_i_dot[i] += fr;
        ++It;
      }
      n_dot_dot += n_i_dot[i];
      ++it;
      ++i;
    }
    if ( n_dot_dot != 0 ){
      it = ValuesArray.begin();
      int m = 0;
      while ( it != ValuesArray.end() ){
        FeatureValue *fv = (FeatureValue *)*it;
        It = fv->TargetDist.begin();
        size_t n = 0;
        while ( It != fv->TargetDist.end() && n < Size ){
          size_t id = It->second->Index()-1;
          long int fr = It->second->Freq();
          while ( n < id ){
            tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) /
              (double)n_dot_dot;
            chi_square += tmp;
          }
          if ( n == id ){
            tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) /
              (double)n_dot_dot;
            if ( fabs(tmp) > Epsilon ){
              chi_square += ( (tmp - fr ) * (tmp - fr ) ) / tmp;
            }
            ++It;
          }
          else
            break;
        }
        while ( n < Size ){
          tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) /
            (double)n_dot_dot;
          chi_square += tmp;
        }
        ++it;
        ++m;
      }
    }
  }
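  // The ChiSquareStatistics() variants above compute the usual chi-square
  // over the value-by-class contingency table: with expected counts
  // E = n_i_dot[i] * n_dot_j[j] / n_dot_dot they sum (E - observed)^2 / E;
  // cells for classes absent from a value's distribution contribute just
  // their expected count E.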
  double Feature::fvDistance( FeatureValue *F, FeatureValue *G,
                              size_t limit ) const {
    bool dummy;
    double result = 0.0;
    if ( F != G ){
      if ( metric->isStorable() && matrixPresent( dummy ) &&
           F->ValFreq() >= matrix_clip_freq &&
           G->ValFreq() >= matrix_clip_freq ){
        result = metric_matrix->Extract( F, G );
      }
      else if ( metric->isNumerical() ) {
        result = metric->distance( F, G, limit, Max() - Min() );
      }
      else {
        result = metric->distance( F, G, limit );
      }
    }
    return result;
  }

  ostream& operator<<(ostream& os, const ValueDistribution& vd ) {
    string tmp;
    vd.DistToString( tmp );
    os << tmp;
    return os;
  }

  ostream& operator<<(ostream& os, const ValueDistribution *vd ) {
    string tmp = "{null}";
    if( vd )
      vd->DistToString( tmp );
    os << tmp;
    return os;
  }

  ValueDistribution *ValueDistribution::read_distribution( istream &is,
                                                           Target *Targ,
                                                           bool do_fr ){
    // read a distribution from stream is into Target
    // if do_fr we also adjust the value of Frequency of the Target, which is
    // otherwise 1. Special case when reading the TopDistribution.
    //
    ValueDistribution *result = 0;
    TargetValue *target;
    string buf;
    size_t freq;
    double sw;
    char nextCh;
    is >> nextCh;   // skip {
    if ( nextCh != '{' ){
      throw string( "missing '{' " );
    }
    else {
      int next;
      do {
        is >> ws >> buf;
        is >> freq;
        if ( do_fr ){
          target = Targ->add_value( buf, freq );
        }
        else
          target = Targ->Lookup( buf );
        if ( !target ){
          delete result;
          result = 0;
          break;
        }
        next = look_ahead(is);
        if ( next == ',' ){
          if ( !result )
            result = new ValueDistribution();
          result->SetFreq( target, freq );
          is >> nextCh;
          next = look_ahead(is);
        }
        else if ( next == '}' ){
          if ( !result )
            result = new ValueDistribution();
          result->SetFreq( target, freq );
        }
        else if ( isdigit(next) ){
          if ( !result )
            result = new WValueDistribution();
          is >> sw;
          result->SetFreq( target, freq, sw );
          next = look_ahead(is);
          if ( next == ',' ){
            is >> nextCh;
            next = look_ahead(is);
          }
        }
      } while ( is && next != '}' );
      if ( is )
        is >> nextCh;   // skip }
      else {
        delete result;
        throw string( "missing '}' " );
      }
    }
    return result;
  }
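  // read_distribution() above accepts both formats written by Save(): a
  // plain "{ NP 4, VP 2 }" yields a ValueDistribution, while a third
  // numeric column per entry (e.g. "{ NP 4 0.25, VP 2 1.5 }") switches to a
  // WValueDistribution carrying sample weights. Class names and numbers in
  // these examples are only illustrative.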
  ValueDistribution *ValueDistribution::read_distribution_hashed( istream &is,
                                                                  Target *Targ,
                                                                  bool do_fr ){
    ValueDistribution *result = 0;
    // read a distribution from stream is into Target
    // if do_fr we also adjust the value of Frequency of the Target, which is
    // otherwise 1. Special case when reading the TopDistribution.
    //
    TargetValue *target;
    size_t freq;
    unsigned int index;
    double sw;
    char nextCh;
    is >> nextCh;   // skip {
    if ( nextCh != '{' ){
      throw string( "missing '{' " );
    }
    else {
      int next;
      do {
        is >> index;
        is >> freq;
        if ( do_fr ){
          target = Targ->add_value( index, freq );
        }
        else
          target = Targ->ReverseLookup( index );
        if ( !target ){
          delete result;
          result = 0;
          break;
        }
        next = look_ahead(is);
        if ( next == ',' ){
          if ( !result )
            result = new ValueDistribution();
          result->SetFreq( target, freq );
          is >> nextCh;
          next = look_ahead(is);
        }
        else if ( next == '}' ){
          if ( !result )
            result = new ValueDistribution();
          result->SetFreq( target, freq );
        }
        else if ( isdigit(next) ){
          is >> sw;
          if ( !result )
            result = new WValueDistribution();
          result->SetFreq( target, freq, sw );
          next = look_ahead(is);
          if ( next == ',' ){
            is >> nextCh;
            next = look_ahead(is);
          }
        }
      } while ( is && next != '}' );
      if ( is )
        is >> nextCh;   // skip }
      else {
        delete result;
        throw string( "missing '}' " );
      }
    }
    return result;
  }

  ostream& operator<<( std::ostream& os, ValueClass const *vc ){
    if ( vc ){
      os << vc->Name();
    }
    else
      os << "*FV-NF*";
    return os;
  }

  FeatureValue::FeatureValue( const std::string& value, size_t hash_val ):
    ValueClass( value, hash_val ), ValueClassProb( NULL ) {
  }

  FeatureValue::FeatureValue( const string& s ):
    ValueClass( s, 0 ),
    ValueClassProb(NULL){
    Frequency = 0;
  }

  FeatureValue::~FeatureValue( ){
    delete ValueClassProb;
  }

  TargetValue::TargetValue( const std::string& value, size_t value_hash ):
    ValueClass( value, value_hash ){}

  size_t BaseFeatTargClass::EffectiveValues() const {
    size_t result = 0;
    VCarrtype::const_iterator it = ValuesArray.begin();
    while( it != ValuesArray.end() ){
      if ( (*it)->ValFreq() > 0 )
        ++result;
      ++it;
    }
    return result;
  }

  size_t BaseFeatTargClass::TotalValues() const {
    size_t result = 0;
    VCarrtype::const_iterator it = ValuesArray.begin();
    while( it != ValuesArray.end() ){
      result += (*it)->ValFreq();
      ++it;
    }
    return result;
  }

  FeatureValue *Feature::Lookup( const string& str ) const {
    FeatureValue *result = NULL;
    unsigned int index = TokenTree->Lookup( str );
    if ( index ) {
      IVCmaptype::const_iterator it = ValuesMap.find( index );
      if ( it != ValuesMap.end() )
        result = (FeatureValue *)(it->second);
    }
    return result;
  }

  FeatureValue *Feature::add_value( const string& valstr,
                                    TargetValue *tv,
                                    int freq ){
    unsigned int hash_val = TokenTree->Hash( valstr );
    return add_value( hash_val, tv, freq );
  }

  FeatureValue *Feature::add_value( size_t index,
                                    TargetValue *tv,
                                    int freq ){
    IVCmaptype::const_iterator it = ValuesMap.find( index );
    if( it == ValuesMap.end() ){
      const string& value = TokenTree->ReverseLookup( index );
      // we want to store the singleton value for this index
      // so we MUST reverse lookup the index
      FeatureValue *fv = new FeatureValue( value, index );
      fv->ValFreq( freq );
      ValuesMap[index] = fv;
      ValuesArray.push_back( fv );
    }
    else {
      it->second->IncValFreq( freq );
    }
    FeatureValue *result = (FeatureValue *)ValuesMap[index];
    if ( tv )
      result->TargetDist.IncFreq(tv, freq );
    return result;
  }

  bool Feature::increment_value( FeatureValue *FV, TargetValue *tv ){
    bool result = false;
    if ( FV ){
      FV->incr_val_freq();
      if ( tv )
        FV->TargetDist.IncFreq(tv,1);
      result = true;
    }
    return result;
  }

  bool Feature::decrement_value( FeatureValue *FV, TargetValue *tv ){
    bool result = false;
    if ( FV ){
      FV->decr_val_freq();
      if ( tv )
        FV->TargetDist.DecFreq(tv);
      result = true;
    }
    return result;
  }
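  // Feature values are interned through the TokenTree StringHash: add_value()
  // above maps a value string to its hash index and keeps exactly one
  // FeatureValue object per index (in ValuesMap/ValuesArray), so Lookup()
  // returns the same object for every occurrence of that value.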
  bool Feature::AllocSparseArrays( size_t Dim ){
    // Loop over all values.
    //
    VCarrtype::const_iterator it = ValuesArray.begin();
    while ( it != ValuesArray.end() ){
      FeatureValue *FV = (FeatureValue*)*it;
      // Loop over all classes.
      if ( FV->ValueClassProb == NULL ){
        if ( !(FV->ValueClassProb = new SparseValueProbClass( Dim )) ){
          return false;
        }
      }
      ++it;
    }
    return true;
  }

  bool Feature::isNumerical() const {
    if ( metric && metric->isNumerical() )
      return true;
    else
      return false;
  }

  bool Feature::isStorableMetric() const {
    if ( metric && metric->isStorable() )
      return true;
    else
      return false;
  }

  BaseFeatTargClass::BaseFeatTargClass( Hash::StringHash *T ):
    TokenTree( T ),
    is_copy(false){
  }

  BaseFeatTargClass::BaseFeatTargClass( const BaseFeatTargClass& in ):
    MsgClass( in ),
    TokenTree( in.TokenTree ){
    ValuesArray = in.ValuesArray;
    ValuesMap = in.ValuesMap;
    is_copy = true;
  }

  BaseFeatTargClass::~BaseFeatTargClass(){
    if ( !is_copy ){
      VCarrtype::iterator it = ValuesArray.begin();
      while ( it != ValuesArray.end() ){
        delete *it;
        ++it;
      }
    }
    ValuesMap.clear();
  }

  TargetValue *Target::Lookup( const string& str ) const {
    TargetValue *result = 0;
    size_t index = TokenTree->Lookup( str );
    if ( index ) {
      IVCmaptype::const_iterator it = ValuesMap.find( index );
      result = (TargetValue *)it->second;
    }
    return result;
  }

  TargetValue *Target::ReverseLookup( size_t index ) const {
    IVCmaptype::const_iterator it = ValuesMap.find( index );
    return dynamic_cast<TargetValue *>(it->second);
  }

  Feature::~Feature(){
    if ( !is_copy ){
      if ( n_dot_j ) {
        delete [] n_dot_j;
        delete [] n_i_dot;
      }
      delete_matrix();
      delete metric;
    }
  }

  bool Feature::matrixPresent( bool& isRead ) const {
    isRead = false;
    if ( metric_matrix != 0 ){
      if ( PrestoreStatus == ps_ok )
        return true;
      else if ( PrestoreStatus == ps_read ){
        isRead = true;
        return true;
      }
    }
    return false;
  }

  size_t Feature::matrix_byte_size() const {
    if ( metric_matrix )
      return metric_matrix->NumBytes();
    else
      return 0;
  }
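  // For numeric features, prepare_numeric_stats() below records the observed
  // value range in n_min/n_max; fvDistance() passes Max() - Min() (the
  // feature's range) on to the numeric metric.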
  FeatVal_Stat Feature::prepare_numeric_stats(){
    double tmp;
    size_t freq;
    bool first = true;
    VCarrtype::const_iterator it = ValuesArray.begin();
    while ( it != ValuesArray.end() ){
      FeatureValue *fv = (FeatureValue*)*it;
      freq = fv->ValFreq();
      if ( freq > 0 ){
        if ( !stringTo( fv->Name(), tmp ) ){
          Warning( "a Non Numeric value '" +
                   string(fv->Name()) +
                   "' in Numeric Feature!" );
          return NotNumeric;
        }
        if ( first ){
          first = false;
          n_min = tmp;
          n_max = tmp;
        }
        else if ( tmp < n_min )
          n_min = tmp;
        else if ( tmp > n_max )
          n_max = tmp;
      }
      ++it;
    }
    if ( fabs(n_max - n_min) < Epsilon )
      return SingletonNumeric;
    else
      return NumericValue;
  }

  inline int min( int i1, int i2 ) {
    return (i1>i2?i2:i1);
  }

  inline size_t min( size_t i1, size_t i2 ) {
    return (i1>i2?i2:i1);
  }

  void Feature::SharedVarianceStatistics( Target *Targ, int eff_cnt ){
    size_t NumInst = Targ->TotalValues();
    int NumCats = Targ->EffectiveValues();
    int k = min( NumCats, eff_cnt ) - 1;
    if ( k == 0 || NumInst == 0 )
      shared_variance = 0;
    else
      shared_variance = chi_square / (double)( NumInst * k );
  }

  void Feature::StandardDeviationStatistics( ){
    double sum = 0.0;
    vector<double> store( ValuesArray.size() );
    for ( unsigned int i=0; i < ValuesArray.size(); ++i ){
      FeatureValue *FV = (FeatureValue *)ValuesArray[i];
      double val = stringTo<double>( FV->Name() );
      store[i] = val;
      sum += val;
    }
    double total = 0.0;
    for ( unsigned int i=0; i < ValuesArray.size(); ++i ){
      double diff = sum - store[i];
      total += diff*diff;
    }
    standard_deviation = sqrt( total / ValuesArray.size() );
  }

  void Feature::clear_matrix(){
    if ( PrestoreStatus == ps_read )
      return;
    else
      delete_matrix();
  }

  void Feature::delete_matrix(){
    if ( metric_matrix ){
      metric_matrix->Clear();
      delete metric_matrix;
    }
    metric_matrix = 0;
    PrestoreStatus = ps_undef;
  }

  bool Feature::setMetricType( const MetricType M ){
    if ( !metric || ( metric && M != metric->type() ) ){
      delete metric;
      metric = getMetricClass(M);
      return true;
    }
    else {
      return false;
    }
  }

  MetricType Feature::getMetricType() const { return metric->type(); }

  bool Feature::store_matrix( int limit){
    //
    // Store a complete distance matrix.
    //
    if ( PrestoreStatus == ps_read )
      return true;
    if ( !metric_matrix )
      metric_matrix = new SparseSymetricMatrix();
    if ( PrestoreStatus != ps_failed && metric->isStorable( ) ) {
      try {
        for ( unsigned int ii=0; ii < ValuesArray.size(); ++ii ){
          FeatureValue *FV_i = (FeatureValue *)ValuesArray[ii];
          for ( unsigned int jj=0; jj < ValuesArray.size(); ++jj ){
            FeatureValue *FV_j = (FeatureValue *)ValuesArray[jj];
            if ( FV_i->ValFreq() >= matrix_clip_freq &&
                 FV_j->ValFreq() >= matrix_clip_freq &&
                 ( Prestored_metric != metric->type() ||
                   fabs(metric_matrix->Extract(FV_i,FV_j)) < Epsilon ) ){
              double dist = metric->distance( FV_i, FV_j, limit );
              metric_matrix->Assign( FV_i, FV_j, dist );
            }
          }
        }
      }
      catch( ... ){
        cout << "hit the ground!" << endl;
        PrestoreStatus = ps_failed;
        return false;
      };
      PrestoreStatus = ps_ok;
    }
    if ( PrestoreStatus == ps_ok ){
      Prestored_metric = metric->type();
    }
    return true;
  }
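  // The prestored matrix built by store_matrix() above is a cache: it only
  // fills cells for value pairs that each occur at least matrix_clip_freq
  // times, and fvDistance() falls back to computing the metric on the fly
  // for the remaining (rare) value pairs.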
  ostream& operator<< (std::ostream& os, SparseValueProbClass *VPC ){
    if ( VPC ) {
      int old_prec = os.precision();
      os.precision(3);
      os.setf( std::ios::fixed );
      SparseValueProbClass::IDmaptype::const_iterator it = VPC->vc_map.begin();
      for ( size_t k = 1; k <= VPC->dimension; ++k ){
        os.setf(std::ios::right, std::ios::adjustfield);
        if ( it != VPC->vc_map.end() && it->first == k ){
          os << "\t" << it->second;
          ++it;
        }
        else
          os << "\t" << 0.0;
      }
      os << setprecision( old_prec );
    }
    else
      os << "(Null SA)";
    return os;
  }

  void Feature::print_vc_pb_array( ostream &os ) const {
    VCarrtype::const_iterator it = ValuesArray.begin();
    while ( it != ValuesArray.end() ){
      FeatureValue *FV = (FeatureValue*)*it;
      if ( FV->ValueClassProb ){
        os << FV << FV->ValueClassProb << endl;
      }
      ++it;
    }
  }

  bool Feature::read_vc_pb_array( istream &is ){
    const char *p;
    unsigned int Num = 0;
    FeatureValue *FV;
    bool first = true;
    // clear all existing arrays
    VCarrtype::const_iterator it = ValuesArray.begin();
    while ( it != ValuesArray.end() ){
      FV = (FeatureValue*)*it;
      if ( FV->ValueClassProb ){
        delete FV->ValueClassProb;
        FV->ValueClassProb = NULL;
      }
      ++it;
    }
    string name;
    string buf;
    while ( getline( is, buf ) ){
      if ( buf.length() < 8 ){
        // "empty" line separates matrices
        break;
      }
      if ( first ){
        p = buf.c_str();
        while ( *p && isspace(*p) ) ++p;    // skip whitespace
        while ( *p && !isspace(*p) ) ++p;   // skip the featurename
        while ( *p && isspace(*p) ){        // move along counting non space items
          p++;                              // found something non-space
          Num++;                            // so increment counter
          while( *p && !isspace(*p) ) p++;  // skip it
        }
        first = false;
      }
      p = buf.c_str();   // restart
      name = "";
      while ( *p && isspace(*p) ) ++p;      // skip whitespace
      while ( *p && !isspace(*p) )
        name += *p++;
      FV = Lookup( name );
      if ( !FV ){
        Warning( "Unknown FeatureValue '" + name + "' in file, (skipped) " );
        continue;
      }
      else {
        FV->ValueClassProb = new SparseValueProbClass( Num );
        size_t ui = 0;
        while ( *p && isspace( *p ) ){
          while ( *p && isspace(*p) ) ++p;  // skip trailing whitespace
          if ( *p ){
            if ( ui == Num ){
              FatalError( "Running out range: " + toString(ui) );
              return false;
            }
            name = "";
            while( *p && !isspace( *p ) )
              name += *p++;
            double value = 0.0;
            if ( !stringTo( name, value ) ){
              Error( "Found illegal value '" + name + "'" );
              return false;
            }
            else if ( value > Epsilon ) {
              FV->ValueClassProb->Assign( ui, value );
            }
            ui++;
          }
        }
      }
    }
    // check if we've got all the values, assign a default if not
    it = ValuesArray.begin();
    while ( it != ValuesArray.end() ){
      FV = (FeatureValue *)*it;
      if ( FV->ValueClassProb == NULL ){
        FV->ValueClassProb = new SparseValueProbClass( Num );
      }
      ++it;
    }
    vcpb_read = true;
    return true;
  }

  bool Feature::fill_matrix( istream &is ) {
    if ( !metric_matrix )
      metric_matrix = new SparseSymetricMatrix();
    else
      metric_matrix->Clear();
    string line;
    while ( getline(is,line) ){
      if ( line.empty() )
        break;
      vector<string> arr;
      double d;
      if ( TiCC::split_at( line, arr, " " ) != 2 ){
        Error( "wrong line in inputfile" );
        return false;
      }
      else if ( arr[0].length() < 2 ){
        Error( "wrong line in inputfile" );
        return false;
      }
      else if ( !stringTo( arr[1], d ) ) {
        Error( "wrong line in inputfile" );
        return false;
      }
      else {
        string stripped = arr[0].substr(1,arr[0].length()-2);
        vector<string> parts;
        if ( TiCC::split_at( stripped, parts, ",\t" ) != 2 ){
          Error( "wrong line in inputfile" );
          return false;
        }
        else {
          FeatureValue *F1 = Lookup(parts[0]);
          FeatureValue *F2 = Lookup(parts[1]);
          metric_matrix->Assign( F1, F2, d );
        }
      }
    }
    PrestoreStatus = ps_read;
    return true;
  }
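  // fill_matrix() above appears to expect one matrix cell per input line:
  // a bracketed value pair followed by a distance, e.g.
  // "[east,<TAB>west] 0.125" (made-up example); the brackets are stripped,
  // the pair is split on ",\t", and the distance is assigned to the two
  // looked-up FeatureValues.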
  void Feature::print_matrix( ostream &os, bool full ) const {
    //
    // Print the matrix.
    //
    int old_prec = os.precision();
    os.setf( ios::scientific );
    if ( full ){
      VCarrtype::const_iterator it1 = ValuesArray.begin();
      while ( it1 != ValuesArray.end() ){
        FeatureValue *FV_i = (FeatureValue *)(*it1);
        os.width(6);
        os.setf(ios::left, ios::adjustfield);
        os << FV_i << ":";
        os.width(12);
        os.precision(3);
        os.setf(ios::right, ios::adjustfield);
        VCarrtype::const_iterator it2 = ValuesArray.begin();
        while ( it2 != ValuesArray.end() ){
          FeatureValue *FV_j = (FeatureValue *)(*it2);
          os.width(12);
          os.precision(3);
          os.setf(ios::right, ios::adjustfield);
          if ( FV_i->ValFreq() < matrix_clip_freq ||
               FV_j->ValFreq() < matrix_clip_freq ){
            os << "*";
          }
          else
            os << metric_matrix->Extract(FV_i,FV_j);
          ++it2;
        }
        os << endl;
        ++it1;
      }
    }
    else {
      os << *metric_matrix << endl;
    }
    os << setprecision( old_prec );
  }

  TargetValue *Target::add_value( const string& valstr, int freq ){
    unsigned int hash_val = TokenTree->Hash( valstr );
    return add_value( hash_val, freq );
  }

  TargetValue *Target::add_value( size_t index, int freq ){
    IVCmaptype::const_iterator it = ValuesMap.find( index );
    if( it == ValuesMap.end() ){
      const string& name = TokenTree->ReverseLookup( index );
      // we want to store the singleton value for this index
      // so we MUST reverse lookup the index
      TargetValue *tv = new TargetValue( name, index );
      tv->ValFreq( freq );
      ValuesMap[index] = tv;
      ValuesArray.push_back( tv );
    }
    else {
      it->second->IncValFreq( freq );
    }
    return (TargetValue *)ValuesMap[index];
  }

  TargetValue *Target::MajorityClass() const {
    ValueClass *result = 0;
    size_t freq = 0;
    VCarrtype::const_iterator it = ValuesArray.begin();
    while ( it != ValuesArray.end() ){
      if ( (*it)->ValFreq() > freq ){
        result = *it;
        freq = result->ValFreq();
      }
      ++it;
    }
    return (TargetValue*)result;
  }

  bool Target::increment_value( TargetValue *TV ){
    bool result = false;
    if ( TV ){
      TV->incr_val_freq();
      result = true;
    }
    return result;
  }

  bool Target::decrement_value( TargetValue *TV ){
    bool result = false;
    if ( TV ){
      TV->decr_val_freq();
      result = true;
    }
    return result;
  }

  Instance::Instance():
    TV(NULL), sample_weight(0.0), occ(1) {
  }

  Instance::~Instance(){
    clear();
  }

  void Instance::clear(){
    for ( unsigned int i=0; i < FV.size(); ++i ){
      if ( FV[i] ){
        if ( FV[i]->isUnknown() ){
          delete FV[i];
        }
      }
      FV[i] = 0;
    }
    TV = 0;
    sample_weight = 0.0;
    occ = 1;
  }

  void Instance::Init( size_t len ){
    FV.resize( len, 0 );
  }

  ostream& operator<<(ostream& os, const Instance *I ){
    if ( I ){
      os << *I;
    }
    else
      os << " Empty Instance";
    return os;
  }

  ostream& operator<<(ostream& os, const Instance& I ){
    for ( unsigned int i=0; i < I.FV.size(); ++i )
      os << I.FV[i] << ", ";
    os << I.TV << " " << I.sample_weight;
    return os;
  }

  void Instance::permute( const std::vector<size_t> permutation ){
    std::vector<FeatureValue *> vals( FV.size(), 0 );
    for ( size_t i=0; i < FV.size(); ++i ){
      vals[i] = FV[permutation[i]];
    }
    for ( size_t i=0; i < FV.size(); ++i ){
      FV[i] = vals[i];
    }
  }

}