/*
$Id: MBLClass.cxx 15868 2013-04-02 14:05:58Z sloot $
$URL: https://ilk.uvt.nl/svn/trunk/sources/Timbl6/src/MBLClass.cxx $
Copyright (c) 1998 - 2013
ILK - Tilburg University
CLiPS - University of Antwerp
This file is part of timbl
timbl is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
timbl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, see <http://www.gnu.org/licenses/>.
For questions and suggestions, see:
http://ilk.uvt.nl/software.html
or send mail to:
timbl@uvt.nl
*/
#include <string>
#include <vector>
#include <set>
#include <iostream>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <limits>
#include <stdexcept>
#include <cmath>
#include <cfloat>
#include <climits>
#include <cstdlib>
#include <ctime>
#include <libxml/tree.h>
#include "ticcutils/StringOps.h"
#include "ticcutils/TreeHash.h"
#include "timbl/MsgClass.h"
#include "timbl/Common.h"
#include "timbl/Types.h"
#include "timbl/Options.h"
#include "timbl/Instance.h"
#include "timbl/IBtree.h"
#include "timbl/BestArray.h"
#include "timbl/Testers.h"
#include "timbl/Metrics.h"
#include "timbl/Choppers.h"
#include "timbl/MBLClass.h"
using namespace std;
using namespace TiCC;
namespace Timbl {
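// fill_table() registers all tunable experiment settings in the
// OptionTable, exactly once per object. Settings added before
// SetFreezeMark() may no longer be changed once an InstanceBase exists.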
void MBLClass::fill_table(){
if ( tableFilled )
return;
else
tableFilled = true;
//cerr << "fill table() for " << (void*)this << endl;
bool stat =
Options.Add( new IntegerOption( "FLENGTH",
&F_length, 0, 1, 32 ) )
&& Options.Add( new SizeOption( "MAXBESTS",
&MaxBests, 500, 10, 100000 ) )
&& Options.Add( new SizeOption( "TRIBL_OFFSET",
&tribl_offset, 0, 0, MaxFeatures ) )
&& Options.Add( new UnsignedOption( "IG_THRESHOLD",
&igThreshold,
1000, 0,
std::numeric_limits<unsigned int>::max() ) )
&& Options.Add( new InputFormatOption( "INPUTFORMAT",
&input_format,
UnknownInputFormat ) )
&& Options.Add( new OrdeningOption( "TREE_ORDER",
&TreeOrder, UnknownOrdening ) )
&& Options.Add( new BoolOption( "ALL_WEIGHTS",
&need_all_weights, false ) )
&& Options.Add( new WeightOption( "WEIGHTING",
&Weighting, GR_w ) )
&& Options.Add( new IntegerOption( "BIN_SIZE",
&Bin_Size, 20, 2, 10000 ) )
&& Options.Add( new UnsignedOption( "IB2_OFFSET",
&ib2_offset, 0, 1, 10000000 ) )
&& Options.Add( new BoolOption( "KEEP_DISTRIBUTIONS",
&keep_distributions, false ) )
&& Options.Add( new BoolOption( "DO_SLOPPY_LOO",
&do_sloppy_loo, false ) )
&& Options.Add( new SizeOption( "TARGET_POS",
&target_pos,
std::numeric_limits<size_t>::max(),
0, MaxFeatures ) );
if ( stat ){
Options.SetFreezeMark();
stat =
Options.Add( new BoolOption( "DO_SILLY",
&do_silly_testing, false ) )
&& Options.Add( new BoolOption( "DO_DIVERSIFY",
&do_diversify, false ) )
&& Options.Add( new DecayOption( "DECAY",
&decay_flag, Zero ) )
&& Options.Add( new IntegerOption( "SEED",
&random_seed, -1, -1, RAND_MAX ) )
&& Options.Add( new IntegerOption( "BEAM_SIZE",
&beamSize, 0, 1, INT_MAX ) )
&& Options.Add( new RealOption( "DECAYPARAM_A",
&decay_alfa, 1.0, 0.0, DBL_MAX ) )
&& Options.Add( new RealOption( "DECAYPARAM_B",
&decay_beta, 1.0, 0.0, DBL_MAX ) )
&& Options.Add( new NormalisationOption( "NORMALISATION",
&normalisation, noNorm ) )
&& Options.Add( new RealOption( "NORM_FACTOR",
&norm_factor, 1.0, Epsilon, DBL_MAX ) )
&& Options.Add( new BoolOption( "EXEMPLAR_WEIGHTS",
&do_sample_weighting, false ) )
&& Options.Add( new BoolOption( "IGNORE_EXEMPLAR_WEIGHTS",
&do_ignore_samples, true ) )
&& Options.Add( new BoolOption( "NO_EXEMPLAR_WEIGHTS_TEST",
&no_samples_test, true ) )
&& Options.Add( new VerbosityOption( "VERBOSITY",
&verbosity, NO_VERB ) )
&& Options.Add( new BoolOption( "EXACT_MATCH",
&do_exact_match, false ) )
&& Options.Add( new BoolOption( "HASHED_TREE",
&hashed_trees, true ) )
&& Options.Add( new MetricOption( "GLOBAL_METRIC",
&globalMetricOption, Overlap ) )
&& Options.Add( new MetricArrayOption( "METRICS",
UserOptions, globalMetricOption,
MaxFeatures+1 ) )
&& Options.Add( new IntegerOption( "MVD_LIMIT",
&mvd_threshold, 1, 1, 100000 ) )
&& Options.Add( new SizeOption( "NEIGHBORS",
&num_of_neighbors, 1, 1, 100000 ) )
&& Options.Add( new IntegerOption( "PROGRESS",
&progress, 10000, 1, INT_MAX ) )
&& Options.Add( new IntegerOption( "HANDLE_OCCURRENCES",
&doOcc, 0, 0, 3 ) )
&& Options.Add( new IntegerOption( "CLIP_FACTOR",
&clip_factor, 10, 0, 1000000 ) );
}
if ( !stat ){
FatalError( "Too many options for OptionTable" );
}
}
void MBLClass::InvalidMessage(void) const{
if ( err_count++ == 1 )
Warning( "A preceding error prevents any operation on this "
"Timbl Object\n"
"other experiments might not be influenced" );
else
Warning( "This Experiment is invalid due to errors" );
}
bool MBLClass::SetOption( const string& line ){
bool result = false;
if ( !ExpInvalid() ){
// Info( "set Option:" + line );
enum SetOptRes opt_res = Options.SetOption( line );
switch ( opt_res ){
case Opt_OK: // OK
MBL_init = false; // To assure redoing initializing stuff
result = true;
break;
case Opt_Frozen:
Warning( "SetOption '" + line + "' ignored.\nThis option may not "
"be changed after an InstanceBase is already created" );
break;
case Opt_Unknown:
Warning( "SetOption '" + line + "' failed.\nOption unknown" );
break;
case Opt_Ill_Val:
Error( "SetOption '" + line +
"' failed.\nIllegal value for this option" );
break;
}
}
return result;
}
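// InitClass() puts a freshly constructed experiment in a sane default
// state for at most 'Size' features and builds the option table.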
void MBLClass::InitClass( const size_t Size ){
GlobalMetric = 0;
is_copy = false;
is_synced = false;
sock_os = 0;
Targets = NULL;
err_count = 0;
MBL_init = false;
tableFilled = false;
need_all_weights = false;
InstanceBase = NULL;
TargetStrings = NULL;
FeatureStrings = NULL;
num_of_features = 0;
target_pos = std::numeric_limits<size_t>::max();
mvd_threshold = 1;
effective_feats = 0;
num_of_num_features = 0;
DBEntropy = -1.0;
ChopInput = 0;
MaxFeatures = Size;
runningPhase = LearnWords;
do_sloppy_loo = false;
do_silly_testing = false;
do_diversify = false;
keep_distributions = false;
UserOptions.resize(MaxFeatures+1);
tester = 0;
// cerr << "call fill table() in InitClass()" << endl;
fill_table();
decay = 0;
myerr = &cerr;
mylog = &cout;
}
MBLClass::MBLClass( const string& name ){
tableFilled = false;
exp_name = name;
}
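// Assignment creates a working copy of an experiment: the Features are
// duplicated, the InstanceBase is shared through Copy(), and Targets
// and both string hashes are shared with the original. is_copy is set
// so that the destructor knows which parts it may delete.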
MBLClass &MBLClass::operator=( const MBLClass& m ){
if ( this != &m ){
is_copy = true;
is_synced = false;
MaxFeatures = m.MaxFeatures;
UserOptions.resize(MaxFeatures+1);
// cerr << "call fill table() in assign" << endl;
fill_table();
F_length = m.F_length;
MaxBests = m.MaxBests;
TreeOrder = m.TreeOrder;
decay_flag = m.decay_flag;
input_format = m.input_format;
random_seed = m.random_seed;
beamSize = m.beamSize;
decay_alfa = m.decay_alfa;
decay_beta = m.decay_beta;
normalisation = m.normalisation;
norm_factor = m.norm_factor;
do_sample_weighting = m.do_sample_weighting;
do_ignore_samples = m.do_ignore_samples;
no_samples_test = m.no_samples_test;
keep_distributions = m.keep_distributions;
verbosity = m.verbosity;
do_exact_match = m.do_exact_match;
sock_os = 0;
globalMetricOption = m.globalMetricOption;
if ( m.GlobalMetric )
GlobalMetric = getMetricClass( m.GlobalMetric->type() );
UserOptions = m.UserOptions;
mvd_threshold = m.mvd_threshold;
num_of_neighbors = m.num_of_neighbors;
dynamic_neighbors = m.dynamic_neighbors;
num_of_features = m.num_of_features;
target_pos = m.target_pos;
progress = m.progress;
Bin_Size = m.Bin_Size;
tribl_offset = m.tribl_offset;
ib2_offset = m.ib2_offset;
clip_factor = m.clip_factor;
runningPhase = m.runningPhase;
Weighting = m.Weighting;
do_sloppy_loo = m.do_sloppy_loo;
do_silly_testing = m.do_silly_testing;
do_diversify = m.do_diversify;
permutation = m.permutation;
tester = 0;
decay = 0;
Features = m.Features;
PermFeatures = m.PermFeatures;
for ( unsigned int i=0; i < Features.size(); ++i ){
Features[i] = new Feature( *m.Features[i] );
if ( m.PermFeatures[i] )
PermFeatures[i] = Features[permutation[i]];
else
PermFeatures[i] = 0;
}
Targets = m.Targets;
err_count = 0;
MBL_init = false;
need_all_weights = false;
InstanceBase = m.InstanceBase->Copy();
TargetStrings = m.TargetStrings;
FeatureStrings = m.FeatureStrings;
effective_feats = m.effective_feats;
num_of_num_features = m.num_of_num_features;
DBEntropy = -1.0;
ChopInput = 0;
setInputFormat( m.input_format );
//one extra to store the target!
CurrInst.Init( num_of_features );
myerr = m.myerr;
mylog = m.mylog;
}
return *this;
}
MBLClass::~MBLClass(){
CurrInst.clear();
if ( !is_copy ){
delete InstanceBase;
delete Targets;
delete TargetStrings;
delete FeatureStrings;
}
else {
if ( is_synced ){
delete InstanceBase;
}
else {
InstanceBase->CleanPartition( false );
}
}
for ( unsigned int i=0; i < Features.size(); ++i ){
delete Features[i];
}
delete sock_os;
delete GlobalMetric;
delete tester;
delete decay;
delete ChopInput;
}
void MBLClass::Info( const string& out_line ) const {
#pragma omp critical
{
// Info NEVER to socket !
if ( exp_name != "" )
*mylog << "-" << exp_name << "-" << out_line << endl;
else
*mylog << out_line << endl;
}
}
void MBLClass::Warning( const string& out_line ) const {
#pragma omp critical
{
if ( sock_os )
*sock_os << "ERROR { " << out_line << " }" << endl;
else {
if ( exp_name != "" )
*myerr << "Warning:-" << exp_name << "-" << out_line << endl;
else
*myerr << "Warning: " << out_line << endl;
}
}
}
void MBLClass::Error( const string& out_line ) const {
if ( sock_os )
*sock_os << "ERROR { " << out_line << " }" << endl;
else {
if ( exp_name != "" )
*myerr << "Error:-" << exp_name << "-" << out_line << endl;
else
*myerr << "Error: " << out_line << endl;
}
err_count++;
}
void MBLClass::FatalError( const string& out_line ) const {
if ( sock_os )
*sock_os << "ERROR { " << out_line << " }" << endl;
else {
if ( exp_name != "" )
*myerr << "FatalError:-" << exp_name << "-" << out_line << endl;
else
*myerr << "FatalError: " << out_line << endl;
throw( runtime_error("Stopped") );
}
}
bool MBLClass::ShowOptions( ostream& os ) const {
os << "Possible Experiment Settings (current value between []):" << endl;
Options.Show_Options( os );
os << endl;
return true;
}
bool MBLClass::ShowSettings( ostream& os ) const{
os << "Current Experiment Settings :" << endl;
Options.Show_Settings( os );
os << endl;
return true;
}
bool MBLClass::connectToSocket( ostream *ss ){
if ( sock_os ){
throw( logic_error( "connectToSocket:: already connected!" ) );
}
else {
sock_os = ss;
if ( sock_os && sock_os->good() ){
return true;
}
else
FatalError( "connecting streams to socket failed" );
}
return false;
}
xmlNode *MBLClass::settingsToXml() const{
ostringstream tmp;
Options.Show_Settings( tmp );
vector<string> lines;
int num = TiCC::split_at( tmp.str(), lines, "\n" );
xmlNode *result = XmlNewNode("settings");
for ( int i=0; i < num; ++i ){
vector<string> parts;
if ( TiCC::split_at( lines[i], parts, ":" ) == 2 ){
string tag = TiCC::trim( parts[0] );
string val = TiCC::trim( parts[1] );
XmlNewTextChild( result, tag, val );
}
}
return result;
}
bool MBLClass::ShowWeights( ostream &os ) const {
if ( ExpInvalid() )
return false;
else {
int OldPrec = os.precision(DBL_DIG);
for ( size_t i=0; i< num_of_features; ++i ){
os.precision(DBL_DIG);
os << "Feature " << i+1 << "\t : "
<< Features[i]->Weight() << endl;
}
os.precision(OldPrec);
}
return true;
}
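// calculatePermutation() orders the features by descending weight,
// using repeated selection of the maximum. Ignored features get weight
// -0.1 so they end up after features with a genuine zero weight.
// For example, W = {0.2, 0.9, 0.5} yields permutation {1, 2, 0}.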
void MBLClass::calculatePermutation( const vector<double>& W ){
vector WR = W;
size_t IgnoredFeatures = 0;
permutation.resize(num_of_features);
for ( size_t j=0; j < num_of_features; ++j ){
permutation[j] = j;
if ( Features[j]->Ignore() ){
WR[j] = -0.1; // to be sure that they are placed AFTER
// those which are really Zero
IgnoredFeatures++;
}
}
if ( IgnoredFeatures == num_of_features ){
Error( "All features seem to be ignored! Nothing to do" );
}
else {
for ( size_t k=0; k < num_of_features; ++k ){
size_t Max = 0;
for ( size_t m=1; m < num_of_features; ++m ){
if ( WR[m] > WR[Max] )
Max = m;
}
WR[Max] = -1;
permutation[k] = Max;
}
}
}
void MBLClass::writePermutation( ostream& os ) const {
os << "Feature Permutation based on "
<< ( Weighting==UserDefined_w?"weightfile":toString(TreeOrder, true))
<< " :" << endl << "< ";
for ( size_t j=0; j < num_of_features-1; ++j ){
os << permutation[j]+1 << ", ";
}
os << permutation[num_of_features-1]+1 << " >" << endl;
}
inline char *CurTime(){
time_t lTime;
struct tm *curtime;
char *time_string;
time(&lTime);
curtime = localtime(&lTime);
time_string = asctime(curtime);
time_string[24] = '\0'; // defeat the newline!
return time_string;
}
void MBLClass::time_stamp( const char *line, int number ) const {
if ( !Verbosity(SILENT) ){
ostringstream ostr;
ostr << line;
if ( number > -1 ){
ostr.width(6);
ostr.setf(ios::right, ios::adjustfield);
ostr << number << " @ ";
}
else
ostr << " ";
ostr << CurTime();
Info( ostr.str() );
}
}
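// InitWeights() copies the statistic selected by 'Weighting' (InfoGain,
// GainRatio, ChiSquare, SharedVariance, ...) into each feature's
// weight; ignored features get weight 0, No_w gives every feature 1.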
void MBLClass::InitWeights(void){
for ( size_t i=0; i< num_of_features; ++i ){
if ( Features[i]->Ignore() )
Features[i]->SetWeight( 0.0 );
else
switch ( Weighting ){
case IG_w:
Features[i]->SetWeight( Features[i]->InfoGain() );
break;
case GR_w:
Features[i]->SetWeight( Features[i]->GainRatio() );
break;
case X2_w:
Features[i]->SetWeight( Features[i]->ChiSquare() );
break;
case SV_w:
Features[i]->SetWeight( Features[i]->SharedVariance() );
break;
case SD_w:
Features[i]->SetWeight( Features[i]->StandardDeviation() );
break;
case UserDefined_w:
break;
case No_w:
Features[i]->SetWeight( 1.0 );
break;
case Unknown_w:
case Max_w:
FatalError( "InitWeights: Invalid Weight in switch: " +
toString( Weighting ) );
break;
}
}
}
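// diverseWeights() shifts all weights down by the minimum weight and
// adds Epsilon, so the smallest weight becomes Epsilon and every
// non-ignored feature keeps a strictly positive weight.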
void MBLClass::diverseWeights(void){
double minW = DBL_MAX;
for ( size_t i=0; i< num_of_features; ++i ){
if ( Features[i]->Ignore() )
continue;
if ( Features[i]->Weight() < minW ){
minW = Features[i]->Weight();
}
}
for ( size_t i=0; i< num_of_features; ++i ){
if ( Features[i]->Ignore() )
continue;
Features[i]->SetWeight( (Features[i]->Weight() - minW ) + Epsilon );
}
}
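// default_order() derives a tree ordering from the weighting scheme
// when the user didn't specify one explicitly.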
void MBLClass::default_order(){
if ( TreeOrder == UnknownOrdening )
switch ( Weighting ){
case GR_w:
TreeOrder = GROrder;
break;
case IG_w:
TreeOrder = IGOrder;
break;
case X2_w:
TreeOrder = X2Order;
break;
case SV_w:
TreeOrder = SVOrder;
break;
case SD_w:
TreeOrder = SDOrder;
break;
case No_w:
TreeOrder = NoOrder;
break;
case UserDefined_w:
TreeOrder = GROrder;
break;
default:
FatalError( "Illegal Weighting Value in Switch: " +
toString( Weighting ) );
break;
}
}
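// set_order() computes an ordering value per feature according to
// TreeOrder, derives the feature permutation from those values and
// fills PermFeatures for the effective (non-ignored) features.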
void MBLClass::set_order(){
calculate_fv_entropy(false);
vector<double> Order(num_of_features);
for ( size_t i=0; i < num_of_features; ++i )
switch( TreeOrder ){
case DataFile:
Order[i] = Features[i]->Weight();
break;
case NoOrder:
Order[i] = (double)(num_of_features-i);
break;
case IGOrder:
Order[i] = Features[i]->InfoGain();
break;
case GROrder:
Order[i] = Features[i]->GainRatio();
break;
case IGEntropyOrder:
Order[i] = Features[i]->InfoGain() * Features[i]->SplitInfo();
break;
case GREntropyOrder:
Order[i] = Features[i]->GainRatio() * Features[i]->SplitInfo();
break;
case X2Order:
Order[i] = Features[i]->ChiSquare();
break;
case SVOrder:
Order[i] = Features[i]->SharedVariance();
break;
case SDOrder:
Order[i] = Features[i]->StandardDeviation();
break;
case OneoverFeature:
Order[i] = 1.0 / Features[i]->ValuesArray.size();
break;
case GRoverFeature:
Order[i] = Features[i]->GainRatio() / Features[i]->ValuesArray.size();
break;
case IGoverFeature:
Order[i] = Features[i]->InfoGain() / Features[i]->ValuesArray.size();
break;
case X2overFeature:
Order[i] = Features[i]->ChiSquare() / Features[i]->ValuesArray.size();
break;
case SVoverFeature:
Order[i] = Features[i]->SharedVariance() / Features[i]->ValuesArray.size();
break;
case SDoverFeature:
Order[i] = Features[i]->StandardDeviation() / Features[i]->ValuesArray.size();
break;
case OneoverSplitInfo:
Order[i] = 1.0 / Features[i]->SplitInfo();
break;
case UnknownOrdening:
case MaxOrdening:
FatalError( "Setorder: Illegal Order Value in Switch: " +
toString( TreeOrder ) );
break;
}
calculatePermutation( Order );
if ( !Verbosity(SILENT) )
writePermutation( *mylog );
for ( size_t j=0; j < num_of_features; ++j ){
if ( j < effective_feats )
PermFeatures[j] = Features[permutation[j]];
else
PermFeatures[j] = NULL;
}
}
void MBLClass::MatrixInfo( ostream& os ) const {
unsigned int TotalCount = 0;
bool dummy;
for ( size_t f = 0; f < num_of_features; ++f ){
if ( !Features[f]->Ignore() &&
Features[f]->isStorableMetric() &&
Features[f]->matrixPresent( dummy ) ){
unsigned int Count = Features[f]->matrix_byte_size();
os << "Size of value-matrix[" << f+1 << "] = "
<< Count << " Bytes " << endl;
TotalCount += Count;
}
}
if ( TotalCount )
os << "Total Size of value-matrices " << TotalCount << " Bytes "
<< endl << endl;
}
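// readMatrices() reads precomputed value-difference matrices from a
// stream. Every block is expected to start with "Feature <n>"; entries
// for features without a storable metric are skipped with a warning.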
bool MBLClass::readMatrices( istream& is ){
string line;
bool skip = false;
bool anything = false;
while ( getline( is, line ) ){
line = TiCC::trim( line );
if ( line.empty() )
continue;
if ( line.find( "Feature" ) != 0 ){
if ( skip )
continue;
else
return false;
}
else {
skip = false;
line = line.substr( 8 );
string::size_type pos = line.find_first_not_of("0123456789");
string nums = line.substr( 0, pos );
size_t num;
if ( !stringTo( nums, num ) ){
FatalError( "no feature index found in the inputfile" );
}
else {
if ( pos == string::npos )
line = "";
else {
line = TiCC::trim( line.substr( pos ) );
}
if ( line.empty() ){
if ( !Features[num-1]->isStorableMetric() ){
Warning( "Ignoring entry for feature " + nums
+ " which is NOT set to a storable metric type."
+ " use -m commandline option to set metrics" );
skip = true;
}
else if ( !Features[num-1]->fill_matrix( is ) )
return false;
else {
Info( "read ValueMatrix for feature " + nums );
anything = true;
}
}
}
}
}
if ( !anything ){
Error( "NO metric values found" );
return false;
}
return true;
}
bool MBLClass::writeMatrices( ostream& os ) const {
for ( size_t i = 0; i < num_of_features; ++i ){
os << "Feature " << i+1;
bool dummy;
if ( !Features[i]->matrixPresent( dummy ) ){
os << " not available.\n" << endl;
}
else {
os << endl;
Features[i]->print_matrix( os );
}
}
return os.good();
}
bool MBLClass::readArrays( istream& is ){
bool result = true;
size_t num;
size_t index = 1;
string buf;
char kar;
do {
is >> ws >> buf;
if ( compare_nocase_n( "feature", buf ) ){
is >> ws >> kar; // skip #
if ( kar != '#' ){
Error( "Input out-of-sync, a '#' was expected" );
result = false;
}
else {
is >> num;
if ( num != index ){
Error( "Wrong feature number " + toString(num) +
" in file, " + toString(index) + " expected" );
result = false;
}
else if ( index > num_of_features ){
Error( "Too many features matrices in this file " );
result = false;
}
else {
is >> ws >> buf;
if ( compare_nocase_n( "Ignored", buf ) ){
if ( Features[index-1]->Ignore() ){
++index;
continue;
}
else {
Error( "Feature #" + toString(index) +
" may not be ignored...");
result = false;
}
}
else if ( compare_nocase_n( "Numeric", buf ) ){
if ( Features[index-1]->isNumerical() ){
++index;
continue;
}
else {
Error( "Feature #" + toString(index) + " is not Numeric..." );
result = false;
}
}
else if ( !compare_nocase_n( "Matrix", buf ) ){
Error( "Problem in Probability file, missing matrix info" );
result = false;
}
else if ( Features[index-1]->Ignore() ||
Features[index-1]->isNumerical() ){
Warning( "Matrix info found for feature #"
+ toString(index)
+ " (skipped)" );
++index;
}
else {
is.ignore( std::numeric_limits<streamsize>::max(), '\n' );
result = Features[index-1]->read_vc_pb_array( is );
++index;
}
}
}
}
}
while ( result && !is.eof() && !is.bad() );
if ( index < num_of_features+1 ){
Error( "Not enough features matrices in this file " );
result = false;
}
return result;
}
bool MBLClass::writeArrays( ostream& os ) {
if ( ExpInvalid() )
return false;
else if ( !initProbabilityArrays( false ) ){
Warning( "couldn't Calculate probability Arrays's" );
return false;
}
else {
// Print the possible classes.
//
os << "Targets : ";
VCarrtype::const_iterator it = Targets->ValuesArray.begin();
while ( it != Targets->ValuesArray.end() ){
os << (TargetValue *)*it;
++it;
if ( it != Targets->ValuesArray.end() )
os << ",";
}
os << "." << endl << endl;
for ( size_t i = 0; i < num_of_features; ++i )
if ( Features[i]->Ignore() )
os << "feature # " << i+1 << " Ignored, (-s option)" << endl;
else if (Features[i]->isNumerical() )
os << "feature # " << i+1 << " Numeric, (-N option)" << endl;
else {
os << "feature # " << i+1 << " Matrix: " << endl;
Features[i]->print_vc_pb_array( os );
os << endl;
}
return true;
}
}
bool MBLClass::allocate_arrays(){
size_t Dim = Targets->ValuesArray.size();
bool result = true;
for ( size_t j = 0; result && j < num_of_features; ++j ) {
if ( !Features[j]->Ignore() &&
!Features[j]->isNumerical() ) {
result = Features[j]->AllocSparseArrays( Dim );
}
} // j
return result;
}
bool MBLClass::initProbabilityArrays( bool force ){
bool result = true;
if ( !is_copy ){
result = allocate_arrays();
if ( result ){
for ( size_t j = 0; j < num_of_features; ++j ) {
if ( !Features[j]->Ignore() &&
!Features[j]->isNumerical() ){
Features[j]->ClipFreq( (int)rint(clip_factor *
log((double)Features[j]->EffectiveValues())));
if ( !Features[j]->ArrayRead() &&
( force ||
Features[j]->isStorableMetric() ) ){
Features[j]->InitSparseArrays();
}
}
} // j
}
}
return result;
}
/*
For mvd metric.
*/
void MBLClass::calculatePrestored(){
if ( !is_copy ){
for ( size_t j = tribl_offset; j < effective_feats; ++j ) {
if ( !PermFeatures[j]->Ignore() &&
PermFeatures[j]->isStorableMetric() ){
PermFeatures[j]->store_matrix( mvd_threshold );
}
}
if ( Verbosity(VD_MATRIX) )
for ( size_t i = 0; i < num_of_features; ++i )
if ( !Features[i]->Ignore() ){
bool dummy;
if (Features[i]->matrixPresent( dummy ) ){
*mylog << "Value matrix of feature # "
<< i+1 << endl;
Features[i]->print_matrix( *mylog, true );
*mylog << endl;
}
else {
*mylog << "Value Difference matrix of feature # "
<< i+1 << endl << "Not available." << endl;
}
}
}
}
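// chopped_to_instance() turns the fields of the last chopped input line
// into the (reused) CurrInst. Depending on the phase the feature and
// target values are added to the value arrays (learning) or only looked
// up (testing); unknown values in test data get a dummy FeatureValue.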
const Instance *MBLClass::chopped_to_instance( PhaseValue phase ){
CurrInst.clear();
if ( num_of_features != target_pos ) {
ChopInput->swapTarget( target_pos );
}
int occ = ChopInput->getOcc();
if ( occ > 1 ){
CurrInst.Occurrences( occ );
}
switch ( phase ){
case LearnWords:
// Add the target.
CurrInst.TV = Targets->add_value( ChopInput->getField( num_of_features ),
occ );
// Now add the Feature values.
for ( size_t i = 0; i < num_of_features; ++i ){
// when learning, no need to bother about Permutation
if ( Features[i]->Ignore() ) // but this might happen, take care!
CurrInst.FV[i] = NULL;
else {
// Add it to the Instance.
CurrInst.FV[i] = Features[i]->add_value( ChopInput->getField(i),
CurrInst.TV, occ );
}
} // i
break;
case TrainWords:
// Lookup for TreeBuilding
// First the Features
for ( size_t k = 0; k < effective_feats; ++k ){
size_t j = permutation[k];
CurrInst.FV[k] = Features[j]->Lookup( ChopInput->getField(j) );
} // k
// and the Target
CurrInst.TV = Targets->Lookup( ChopInput->getField( num_of_features ) );
break;
case TrainLearnWords:
// Lookup for Incremental TreeBuilding
// Assumes that somehow Permutation and effective_feats are known
// First the Target
CurrInst.TV = Targets->add_value( ChopInput->getField(num_of_features ),
occ );
// Then the Features
for ( size_t l = 0; l < effective_feats; ++l ){
size_t j = permutation[l];
CurrInst.FV[l] = Features[j]->add_value( ChopInput->getField(j),
CurrInst.TV, occ );
} // for l
break;
case TestWords:
// Lookup for Testing
// This might fail for unknown values, then we create a dummy value
for ( size_t m = 0; m < effective_feats; ++m ){
size_t j = permutation[m];
const string& fld = ChopInput->getField(j);
CurrInst.FV[m] = Features[j]->Lookup( fld );
if ( !CurrInst.FV[m] ){
// for "unknown" values have to add a dummy value
CurrInst.FV[m] = new FeatureValue( fld );
}
} // m
// the last string is the target
CurrInst.TV = Targets->Lookup( ChopInput->getField(num_of_features) );
break;
default:
FatalError( "Wrong value in Switch: "
+ toString(phase) );
}
if ( ( phase != TestWords ) && doSamples() ){
double exW = ChopInput->getExW();
if ( exW < 0 )
exW = 1.0;
CurrInst.ExemplarWeight( exW );
}
return &CurrInst;
}
bool empty_line( const string& Line, const InputFormatType IF ){
// determine whether Line is empty or a comment line
bool result = ( Line.empty() ||
( IF == ARFF && // ARFF "comment"
( Line[0] == '%' || Line[0] == '@' ) ) ||
( Line.find_first_not_of( " \t" ) == string::npos ) );
return result;
}
string MBLClass::get_org_input( ) const {
return ChopInput->getString();
}
void MBLClass::LearningInfo( ostream& os ) {
if ( !ExpInvalid() && !Verbosity(SILENT) ){
calculate_fv_entropy( !MBL_init );
os.setf(ios::showpoint );
int OldPrec = os.precision(8);
os << "DB Entropy : " << DBEntropy << endl;
os << "Number of Classes : " << Targets->EffectiveValues() << endl;
os << endl;
if ( Verbosity(FEAT_W) ){
if ( CurrentWeighting() == SD_w ){
os << "Feats\tVals\tStandard Deviation" << endl;
for ( size_t i = 0; i < num_of_features; ++i ) {
os << setw(5) << i+1;
os.setf(ios::right, ios::adjustfield);
if ( Features[i]->Ignore() ){
os << " (ignored) " << endl;
}
else {
os.setf(ios::right, ios::adjustfield);
os << setw(7) << Features[i]->EffectiveValues()
<< "\t" << Features[i]->StandardDeviation();
if ( Features[i]->isNumerical() )
os << " NUMERIC";
os << endl;
}
}
os << endl;
os.precision(OldPrec);
}
else if ( need_all_weights ){
os << "Feats\tVals\tX-square\tVariance\tInfoGain\tGainRatio" << endl;
for ( size_t i = 0; i < num_of_features; ++i ) {
os << setw(5) << i+1;
os.setf(ios::right, ios::adjustfield);
if ( Features[i]->Ignore() ){
os << " (ignored) " << endl;
}
else {
os.setf(ios::right, ios::adjustfield);
os << setw(7) << Features[i]->EffectiveValues()
<< "\t" << Features[i]->ChiSquare()
<< "\t" << Features[i]->SharedVariance()
<< "\t" << Features[i]->InfoGain()
<< "\t" << Features[i]->GainRatio();
if ( Features[i]->isNumerical() )
os << " NUMERIC";
os << endl;
}
}
os << endl;
os.precision(OldPrec);
}
else {
os << "Feats\tVals\tInfoGain\tGainRatio" << endl;
for ( size_t i = 0; i < num_of_features; ++i ) {
os << setw(5) << i+1;
os.setf(ios::right, ios::adjustfield);
if ( Features[i]->Ignore() ){
os << " (ignored) " << endl;
}
else {
os.setf(ios::right, ios::adjustfield);
os << setw(7) << Features[i]->EffectiveValues()
<< "\t" << Features[i]->InfoGain()
<< "\t" << Features[i]->GainRatio();
if ( Features[i]->isNumerical() )
os << " NUMERIC";
os << endl;
}
}
os << endl;
os.precision(OldPrec);
}
}
}
}
bool MBLClass::writeWeights( ostream& os ) const {
bool result = false;
if ( !ExpInvalid() ){
if ( Features[0] == NULL ){
Warning( "unable to save Weights, nothing learned yet" );
}
else {
os << "# DB Entropy: " << DBEntropy << endl;
os << "# Classes: " << Targets->ValuesArray.size() << endl;
os << "# Lines of data: " << Targets->TotalValues() << endl;
int OldPrec = os.precision(DBL_DIG);
if ( CurrentWeighting() == SD_w ){
os << "#" << endl;
os << "# " << toString( SD_w ) << endl;
os << "# Fea." << "\t" << "Weight" << endl;
for ( size_t i = 0; i < num_of_features; ++i ) {
os.precision(DBL_DIG);
if ( Features[i]->Ignore() )
os << i+1 << "\t" << "Ignore" << endl;
else
os << i+1 << "\t" << Features[i]->StandardDeviation() << endl;
}
os << "#" << endl;
}
else {
os << "# " << toString( No_w ) << endl;
os << "# Fea." << "\t" << "Weight" << endl;
for ( size_t i = 0; i < num_of_features; ++i ) {
os.precision(DBL_DIG);
if ( Features[i]->Ignore() )
os << i+1 << "\t" << "Ignore" << endl;
else
os << i+1 << "\t" << 1.0 << endl;
}
os << "#" << endl;
os << "# " << toString( GR_w ) << endl;
os << "# Fea." << "\t" << "Weight" << endl;
for ( size_t i = 0; i < num_of_features; ++i ) {
os.precision(DBL_DIG);
if ( Features[i]->Ignore() )
os << i+1 << "\t" << "Ignore" << endl;
else
os << i+1 << "\t" << Features[i]->GainRatio() << endl;
}
os << "#" << endl;
os << "# " << toString( IG_w ) << endl;
os << "# Fea." << "\t" << "Weight" << endl;
for ( size_t i = 0; i < num_of_features; ++i ) {
os.precision(DBL_DIG);
if ( Features[i]->Ignore() )
os << i+1 << "\t" << "Ignore" << endl;
else
os << i+1 << "\t" << Features[i]->InfoGain() << endl;
}
if ( need_all_weights ){
os << "#" << endl;
os << "# " << toString( SV_w ) << endl;
os << "# Fea." << "\t" << "Weight" << endl;
for ( size_t i = 0; i < num_of_features; ++i ) {
os.precision(DBL_DIG);
if ( Features[i]->Ignore() )
os << i+1 << "\t" << "Ignore" << endl;
else
os << i+1 << "\t" << Features[i]->SharedVariance() << endl;
}
os << "#" << endl;
os << "# " << toString( X2_w ) << endl;
os << "# Fea." << "\t" << "Weight" << endl;
for ( size_t i = 0; i < num_of_features; ++i ) {
os.precision(DBL_DIG);
if ( Features[i]->Ignore() )
os << i+1 << "\t" << "Ignore" << endl;
else
os << i+1 << "\t" << Features[i]->ChiSquare() << endl;
}
os << "#" << endl;
}
}
os.precision(OldPrec);
result = true;
}
}
return result;
}
bool MBLClass::read_the_vals( istream& is ){
bool result = true;
bool *done = new bool[num_of_features];
for ( size_t i=0; i < num_of_features; ++i )
done[i] = false;
string Buffer;
while ( getline( is, Buffer) ){
if ( !Buffer.empty() ){
if ( Buffer[0] == '#'){
break;
}
// Line looks like:
// 28 0.445481
// or:
// 13 Ignore
//
vector<string> vals;
if ( TiCC::split( Buffer, vals ) == 2 ){
size_t i_f = stringTo<size_t>( vals[0] );
if ( i_f > num_of_features ){
Error( "in weightsfile, Feature index > Maximum, (" +
toString(num_of_features) + ")" );
}
else if ( done[i_f-1] ){
Error( "in weightsfile, Feature index " + vals[0] +
" is mentioned twice" );
}
else {
done[i_f-1] = true;
if ( !compare_nocase( vals[1], "Ignore" ) ){
double w;
if ( !stringTo( vals[1], w ) ){
Error( "in weightsfile, Feature " + vals[0] +
" has illegal value: " + vals[1] );
}
else {
Features[i_f-1]->SetWeight( w );
if ( Features[i_f-1]->Ignore() )
Warning( "in weightsfile, "
"Feature " + vals[0] + " has value: " +
toString( w ) +
" assigned, but will be ignored" );
}
}
else {
Features[i_f-1]->SetWeight( 0.0 );
if ( !Features[i_f-1]->Ignore() )
Warning( "in weightsfile, Feature " + vals[0] +
" has value: 'Ignore', we will use: 0.0 " );
}
}
}
}
}
if ( result ){
for ( size_t j=0; j < num_of_features; ++j )
if ( !done[j] ) {
Error( "in weightsfile, Feature index " + toString(j+1) +
" is not mentioned" );
result = false;
}
}
delete [] done;
return result;
}
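// readWeights() scans a weightsfile for the section that matches the
// wanted weighting, marked by a '#' header naming the weight. When no
// such header is found the file is assumed to be an old-style
// weightsfile with a single weights column. All weight statistics are
// overwritten with the values read and Weighting becomes UserDefined_w.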
bool MBLClass::readWeights( istream& is, WeightType wanted ){
set<WeightType> ret_weights;
bool result = false;
bool old_style = true;
if ( !ExpInvalid() ){
string Buffer;
while( getline( is, Buffer ) ) {
// A comment starts with '#'
//
if ( Buffer.empty() )
continue;
else {
if ( Buffer[0] == '#'){
vector<string> vals;
if ( TiCC::split_at( Buffer, vals, " " ) == 2 ){
WeightType tmp_w = Unknown_w;
if ( !stringTo( vals[1], tmp_w ) )
continue;
else {
old_style = false;
if ( tmp_w == wanted ){
getline( is, Buffer );
result = read_the_vals( is );
break;
}
}
}
}
}
}
if ( is.eof() ){
if ( old_style ){
// wanted weighting not found
// Old style weightsfile?
// Warning( "Old Style weightsfile. Please update" );
is.clear();
is.seekg(0);
size_t pos = 0;
while( getline( is, Buffer ) ) {
// A comment starts with '#'
//
if ( Buffer.empty() ){
pos = is.tellg();
continue;
}
else {
if ( Buffer[0] == '#'){
pos = is.tellg();
continue;
}
is.seekg(pos);
result = read_the_vals( is );
break;
}
}
}
}
if ( !result ){
Warning( "Unable to retrieve "
+ toString( wanted ) + " Weights" );
Warning( "unable to continue" );
return false;
}
// make sure all weights are correct
// Paranoid?
for ( size_t i=0; i< num_of_features; ++i ){
Features[i]->InfoGain( Features[i]->Weight() );
Features[i]->GainRatio( Features[i]->Weight() );
Features[i]->ChiSquare( Features[i]->Weight() );
Features[i]->SharedVariance( Features[i]->Weight() );
Features[i]->StandardDeviation( 0.0 );
}
Weighting = UserDefined_w;
}
return true;
}
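// calculate_fv_entropy() computes the database entropy of the target
// distribution, H = -SUM_c p(c)*log2( p(c) ), checks numeric features
// for singular or non-numeric values (possibly falling back to the
// global or Overlap metric) and (re)computes the feature statistics.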
void MBLClass::calculate_fv_entropy( bool always ){
bool realy_first = DBEntropy < 0.0;
if ( always || realy_first ){
// if it's the first time (DBEntropy < 0) or
// if always, we have to (re)calculate everything
double Entropy = 0.0, Ratio;
// first get the Database Entropy
size_t totval = Targets->TotalValues();
VCarrtype::const_iterator it = Targets->ValuesArray.begin();
while ( it != Targets->ValuesArray.end() ){
Ratio = (*it)->ValFreq() /
(double)totval;
if ( Ratio > 0 )
Entropy += Ratio * Log2(Ratio);
++it;
}
DBEntropy = fabs(-Entropy);
allocate_arrays(); // create ValueClassProb arrays..
}
// Loop over the Features, see if the numerics are non-singular
// and do the statistics for those features where the metric is changed.
FeatVal_Stat *feat_status = new FeatVal_Stat[num_of_features];
bool nothing_changed = true;
for ( size_t g = 0; g < num_of_features; ++g ) {
feat_status[g] = Unknown;
if ( Features[g]->Ignore() )
continue;
bool metricChanged = false;
MetricType TmpMetricType = UserOptions[g+1];
metricClass *tmpMetric = getMetricClass( TmpMetricType );
if ( tmpMetric->isNumerical() ){
feat_status[g] = Features[g]->prepare_numeric_stats();
if ( feat_status[g] == SingletonNumeric &&
input_format == SparseBin &&
GlobalMetric->isSimilarityMetric( ) ){
// ok
}
else if ( feat_status[g] != NumericValue ){
if ( GlobalMetric->isNumerical() ){
TmpMetricType = Overlap;
}
else {
TmpMetricType = globalMetricOption;
}
}
}
else if ( Features[g]->ValuesArray.size() == 1 )
feat_status[g] = Singleton;
delete tmpMetric;
if ( always || realy_first ){
bool isRead;
if ( Features[g]->metric &&
Features[g]->getMetricType() != TmpMetricType &&
Features[g]->isStorableMetric() &&
Features[g]->matrixPresent( isRead ) &&
isRead ){
Error( "The metric " + toString(Features[g]->getMetricType()) +
" for feature " + toString( g+1 ) +
" is set from a file. It cannot be changed!" );
return;
}
metricChanged = !Features[g]->setMetricType(TmpMetricType);
}
if ( metricChanged )
nothing_changed = false;
} // end g
if ( ( CurrentWeighting() == SD_w ||
GlobalMetric->isSimilarityMetric() )
&& !nothing_changed ){
// check to see if ALL features are still Numeric.
// otherwise we can't do Standard Deviation weighting,
// or Similarity Metrics!
bool first = true;
ostringstream ostr1;
ostringstream ostr2;
for ( size_t ff = 0; ff < num_of_features; ++ff )
if ( feat_status[ff] == NotNumeric ){
if ( first ){
ostr1 << "The following feature(s) have non numeric value: ";
first = false;
}
else
ostr1 << ", ";
size_t n = ff;
while ( ff < num_of_features-1 &&
feat_status[ff+1] == NotNumeric )
ff++;
if ( n != ff ){
ostr1 << n+1 << "-" << ff+1;
}
else
ostr1 << ff+1;
}
if ( !first ){
Error( ostr1.str() );
if ( GlobalMetric->isSimilarityMetric() )
Error( "Therefore InnerProduct/Cosine operations are impossible" );
else
Error( "Therefore " + toString(CurrentWeighting()) + " weighting is impossible" );
return;
}
}
// Give a warning for singular features, except when it's
// a result of a forced recalculation
if ( realy_first ){
bool first = true;
ostringstream ostr1;
ostringstream ostr2;
for ( size_t ff = 0; ff < num_of_features; ++ff ) {
if ( feat_status[ff] == Singleton ||
feat_status[ff] == SingletonNumeric ){
if ( first ){
ostr1 << "The following feature(s) have only 1 value: ";
first = false;
}
else
ostr1 << ", ";
size_t n = ff;
while ( ff < num_of_features-1 &&
( feat_status[ff+1] == Singleton ||
feat_status[ff+1] == SingletonNumeric ) )
ff++;
if ( n != ff ){
ostr1 << n+1 << "-" << ff+1;
}
else
ostr1 << ff+1;
}
}
if ( !first && !is_copy ){
Warning( ostr1.str() );
}
first = true;
for ( size_t ff = 0; ff < num_of_features; ++ff )
if ( feat_status[ff] == NotNumeric ){
if ( first ){
ostr2 << "The following feature(s) contained non-numeric values and\nwill be treated as NON-Numeric: ";
first = false;
}
else
ostr2 << ", ";
size_t n = ff;
while ( ff < num_of_features-1 &&
feat_status[ff+1] == NotNumeric ) ff++;
if ( n != ff ){
ostr2 << n+1 << "-" << ff+1;
}
else
ostr2 << ff+1;
}
if ( !first ){
Warning( ostr2.str() );
}
}
if ( always || realy_first ){
for ( size_t i = 0; i < num_of_features; ++i ) {
if ( Weighting != UserDefined_w ){
if ( CurrentWeighting() == SD_w )
Features[i]->StandardDeviationStatistics( );
else if ( Features[i]->isNumerical() ){
Features[i]->NumStatistics( DBEntropy, Targets, Bin_Size,
need_all_weights );
}
else {
Features[i]->Statistics( DBEntropy, Targets,
need_all_weights );
}
}
}
}
delete [] feat_status;
}
bool MBLClass::writeNamesFile( ostream& os ) const {
bool result = true;
if ( ExpInvalid() ){
result = false;
}
else {
// Print the possible classes.
//
VCarrtype::const_iterator it = Targets->ValuesArray.begin();
while ( it != Targets->ValuesArray.end() ){
os << (TargetValue *)*it;
++it;
if ( it != Targets->ValuesArray.end() )
os << ",";
}
os << "." << endl << endl;
// Loop over the Features.
//
for ( size_t f = 0; f < num_of_features; ++f ) {
// Print the Feature name, and a colon.
//
os << "a" << f+1 << ": ";
if ( Features[f]->Ignore() )
os << "Ignore" << endl;
else if ( Features[f]->isNumerical() )
os << "Numeric" << endl;
else {
// Loop over the values.
//
VCarrtype::const_iterator it2 = Features[f]->ValuesArray.begin();
while( it2 != Features[f]->ValuesArray.end() ){
os << (FeatureValue *)*it2;
++it2;
if ( it2 != Features[f]->ValuesArray.end() )
os << ",";
}
os << "." << endl;
}
}
}
return result;
}
bool MBLClass::Chop( const string& line ) {
try {
return ChopInput->chop( line, num_of_features );
}
catch ( const exception& e ){
Warning( e.what() );
return false;
}
}
bool MBLClass::setInputFormat( const InputFormatType IF ){
if ( ChopInput ){
delete ChopInput;
ChopInput = 0;
}
ChopInput = Chopper::create( IF, chopExamples(), F_length, chopOcc() );
if ( ChopInput ){
input_format = IF;
return true;
}
return false;
}
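// ExactMatch() attempts a direct lookup in the InstanceBase. This
// shortcut is only valid for non-similarity metrics, and only when
// exact matching is requested or k == 1 without neighbor verbosity.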
const ValueDistribution *MBLClass::ExactMatch( const Instance& inst ) const {
const ValueDistribution *result = NULL;
if ( !GlobalMetric->isSimilarityMetric() &&
( do_exact_match ||
( num_of_neighbors == 1 &&
!( Verbosity( NEAR_N | ALL_K) ) ) ) ){
result = InstanceBase->ExactMatch( inst );
}
return result;
}
double MBLClass::getBestDistance() const {
return nSet.bestDistance();
}
WValueDistribution *MBLClass::getBestDistribution( unsigned int k ){
return nSet.bestDistribution( decay, k );
}
string MBLClass::formatInstance( const vector<FeatureValue *>& OrgFV,
vector<FeatureValue *>& RedFV,
size_t OffSet,
size_t Size ) const {
string result;
Instance inst( Size );
for ( size_t i=0; i< OffSet; ++i )
inst.FV[i] = OrgFV[i];
for ( size_t j=OffSet; j< Size; ++j )
inst.FV[j] = RedFV[j-OffSet];
size_t *InvPerm = new size_t[num_of_features];
for ( size_t i=0; i< num_of_features; ++i )
InvPerm[permutation[i]] = i;
for ( size_t j=0; j< num_of_features; ++j ){
switch ( input_format ) {
case C4_5:
// fall through
case ARFF:
if ( Features[j]->Ignore() )
result += "-*-,";
else
result += inst.FV[InvPerm[j]]->Name() + ",";
break;
case Sparse:
if ( inst.FV[InvPerm[j]]->Name() != DefaultSparseString )
result += string("(") + toString(j+1) + ","
+ CodeToStr( inst.FV[InvPerm[j]]->Name() ) + ")";
break;
case SparseBin:
if ( inst.FV[InvPerm[j]]->Name()[0] == '1' )
result += toString( j+1 ) + ",";
break;
case Columns:
if ( Features[j]->Ignore() )
result += "-*- ";
else
result += inst.FV[InvPerm[j]]->Name() + " ";
break;
case Tabbed:
if ( Features[j]->Ignore() )
result += "-*- ";
else
result += inst.FV[InvPerm[j]]->Name() + "\t";
break;
default:
if ( Features[j]->Ignore() )
result += string( F_length, '*' );
else
result += inst.FV[InvPerm[j]]->Name();
break;
}
}
delete [] InvPerm;
return result;
}
inline double WeightFun( double D, double W ){
return D / (W + Common::Epsilon);
}
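// test_instance_ex() handles exemplar-weighted testing: every stored
// exemplar is scored separately, its distance divided by its exemplar
// weight (WeightFun above). test() is called with a DBL_MAX threshold
// so it never steps out before the last feature.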
void MBLClass::test_instance_ex( const Instance& Inst,
InstanceBase_base *IB,
size_t ib_offset ){
vector<FeatureValue *> CurrentFV(num_of_features);
size_t EffFeat = effective_feats - ib_offset;
const ValueDistribution *best_distrib = IB->InitGraphTest( CurrentFV,
&Inst.FV,
ib_offset,
effective_feats );
tester->init( Inst, effective_feats, ib_offset );
ValueDistribution::dist_iterator lastpos;
Vfield *Bpnt = NULL;
if ( best_distrib ){
lastpos = best_distrib->begin();
if ( lastpos != best_distrib->end() )
Bpnt = lastpos->second;
}
size_t CurPos = 0;
while ( Bpnt ) {
// call test() with a maximum threshold, to prevent stepping out early
size_t EndPos = tester->test( CurrentFV,
CurPos,
DBL_MAX );
if ( EndPos != EffFeat ){
throw( logic_error( "Exemplar testing: test should not stop before last feature" ) );
}
ValueDistribution ResultDist;
ResultDist.SetFreq( Bpnt->Value(), Bpnt->Freq() );
string origI;
if ( Verbosity(NEAR_N) ){
origI = formatInstance( Inst.FV, CurrentFV,
ib_offset,
num_of_features );
}
double Distance = WeightFun( tester->getDistance(EndPos),
Bpnt->Weight() );
bestArray.addResult( Distance, &ResultDist, origI );
CurPos = EndPos-1;
++lastpos;
if ( lastpos != best_distrib->end() ){
Bpnt = lastpos->second;
}
else {
best_distrib = IB->NextGraphTest( CurrentFV,
CurPos );
Bpnt = NULL;
if ( best_distrib ){
lastpos = best_distrib->begin();
if ( lastpos != best_distrib->end() ){
Bpnt = lastpos->second;
}
}
}
}
}
void MBLClass::initDecay(){
if ( decay ){
delete decay;
decay = 0;
}
switch ( decay_flag ){
case InvDist:
decay = new invDistDecay();
break;
case InvLinear:
decay = new invLinDecay();
break;
case ExpDecay:
decay = new expDecay( decay_alfa, decay_beta );
break;
case Zero: // fall through
default:
break;
}
}
void MBLClass::initTesters() {
delete GlobalMetric;
GlobalMetric = getMetricClass( globalMetricOption );
delete tester;
tester = getTester( globalMetricOption, Features, permutation, mvd_threshold );
}
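// test_instance() walks the instance base graph. Each time a path
// reaches the last effective feature its distance is offered to
// bestArray, which returns the worst distance still worth keeping; that
// threshold then prunes later paths: we roll back to the deepest
// position whose partial distance can still beat it.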
void MBLClass::test_instance( const Instance& Inst,
InstanceBase_base *IB,
size_t ib_offset ){
vector<FeatureValue *> CurrentFV(num_of_features);
double Threshold = DBL_MAX;
size_t EffFeat = effective_feats - ib_offset;
const ValueDistribution *best_distrib = IB->InitGraphTest( CurrentFV,
&Inst.FV,
ib_offset,
effective_feats );
tester->init( Inst, effective_feats, ib_offset );
// cerr << "start test Instance = " << &Inst << " met " << toString(CurrentFV) << endl;
// cerr << "BA at start = " << bestArray << endl;
size_t CurPos = 0;
while ( best_distrib ){
// cerr << "test:" << toString(CurrentFV) << endl;
size_t EndPos = tester->test( CurrentFV,
CurPos,
Threshold + Epsilon );
// cerr << "EndPos = " << EndPos << endl;
if ( EndPos == EffFeat ){
// we finished with a certain amount of success
double Distance = tester->getDistance(EndPos);
if ( Distance >= 0.0 ){
string origI;
if ( Verbosity(NEAR_N) ){
origI = formatInstance( Inst.FV, CurrentFV,
ib_offset,
num_of_features );
}
// cerr << "Ok add " << best_distrib << "at distance " << Distance << endl;
Threshold = bestArray.addResult( Distance, best_distrib, origI );
// cerr << "BA = " << bestArray << endl;
if ( do_silly_testing )
Threshold = DBL_MAX;
}
else {
Error( "DISTANCE == " + toString(Distance) );
FatalError( "we are dead" );
}
}
else {
EndPos++; // out of luck, compensate for roll-back
}
size_t pos=EndPos-1;
// cerr << "start rollback " << pos << endl;
while ( true ){
// cerr << "rollback " << pos << endl;
if ( tester->getDistance(pos) <= Threshold ){
CurPos = pos;
// cerr << "voor next test " << endl;
best_distrib = IB->NextGraphTest( CurrentFV,
CurPos );
// cerr << "na next test, curpos=" << CurPos << "-" << toString(CurrentFV) << endl;
break;
}
if ( pos == 0 )
break;
--pos;
}
}
// cerr << "BA at end = " << bestArray << endl;
}
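// test_instance_sim() is the variant for similarity metrics (Cosine,
// DotProduct): no distance threshold can be used (a dummy value is
// passed to test()) and every path is evaluated to the end.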
void MBLClass::test_instance_sim( const Instance& Inst,
InstanceBase_base *IB,
size_t ib_offset ){
vector<FeatureValue *> CurrentFV(num_of_features);
size_t EffFeat = effective_feats - ib_offset;
const ValueDistribution *best_distrib = IB->InitGraphTest( CurrentFV,
&Inst.FV,
ib_offset,
effective_feats );
tester->init( Inst, effective_feats, ib_offset );
size_t CurPos = 0;
while ( best_distrib ){
double dummy = -1.0;
size_t EndPos = tester->test( CurrentFV,
CurPos,
dummy );
if ( EndPos == EffFeat ){
// we finished with a certain amount of success
double Distance = tester->getDistance(EndPos);
if ( Distance >= 0.0 ){
string origI;
if ( Verbosity(NEAR_N) ){
origI = formatInstance( Inst.FV, CurrentFV,
ib_offset,
num_of_features );
}
bestArray.addResult( Distance, best_distrib, origI );
}
else if ( GlobalMetric->type() == DotProduct ){
Error( "The Dot Product metric fails on your data: intermediate result too big to handle," );
Info( "you might consider using the Cosine metric '-mC' " );
FatalError( "timbl terminated" );
}
else {
Error( "DISTANCE == " + toString(Distance) );
FatalError( "we are dead" );
}
}
else {
EndPos++; // out of luck, compensate for roll-back
}
if ( EndPos > 0 ){
CurPos = EndPos-1;
best_distrib = IB->NextGraphTest( CurrentFV,
CurPos );
}
}
}
void MBLClass::TestInstance( const Instance& Inst,
InstanceBase_base *SubTree,
size_t level ){
// must be cleared for EVERY test
if ( doSamples() ){
test_instance_ex( Inst, SubTree, level );
}
else {
if ( GlobalMetric->isSimilarityMetric( ) )
test_instance_sim( Inst, SubTree, level );
else
test_instance( Inst, SubTree, level );
}
}
size_t MBLClass::countFeatures( const string& inBuffer,
const InputFormatType IF ) const {
size_t result = 0;
if ( IF == Sparse || IF == SparseBin )
return num_of_features;
else {
try {
result = Chopper::countFeatures( inBuffer, IF, F_length,
chopExamples() || chopOcc() );
}
catch( const runtime_error& e ){
Error( e.what() );
}
catch( const exception& e ){
FatalError( e.what() );
}
}
return result;
}
InputFormatType MBLClass::getInputFormat( const string& inBuffer ) const {
return Chopper::getInputFormat( inBuffer, chopExamples() || chopOcc() );
}
size_t MBLClass::examineData( const string& FileName ){
// Looks at the data files, counts num_of_features.
// and sets input_format variables.
//
size_t NumF = 0;
InputFormatType IF = UnknownInputFormat;
// Open the file.
//
if ( FileName == "" ) {
Warning( "couldn't initialize: No FileName specified " );
return 0;
}
else {
string Buffer;
ifstream datafile( FileName.c_str(), ios::in);
if (!datafile) {
Warning( "can't open DataFile: " + FileName );
return 0;
}
else if ( input_format != UnknownInputFormat ){
// The format is somehow already known, so use that
if ( input_format == SparseBin || input_format == Sparse )
NumF = MaxFeatures;
else {
if ( !getline( datafile, Buffer ) ) {
Warning( "empty data file" );
}
else {
bool more = true;
if ( input_format == ARFF ){
while ( !compare_nocase_n( "@DATA", Buffer ) ){
if ( !getline( datafile, Buffer ) ){
Warning( "empty data file" );
more = false;
break;
};
}
if ( more && !getline( datafile, Buffer ) ){
Warning( "empty data file" );
more = false;
};
}
while ( more && empty_line( Buffer, input_format ) ){
if ( !getline( datafile, Buffer ) ){
Warning( "empty data file" );
more = false;
};
}
// now we have a usable line,
//analyze it using the User defined input_format
NumF = countFeatures( Buffer, input_format );
}
}
IF = input_format;
}
else if ( !getline( datafile, Buffer ) ){
Warning( "empty data file: " + FileName );
}
// We start by reading the first line so we can figure out the number
// of Features, and see if the file is comma separated or not, etc.
//
else {
if ( IF == ARFF ){
// Remember, we DON'T want to auto-detect ARFF
while ( !compare_nocase_n( "@DATA", Buffer ) ){
if ( !getline( datafile, Buffer ) ) {
Warning( "no ARRF data after comments: " + FileName );
return 0;
}
}
do {
if ( !getline( datafile, Buffer ) ) {
Warning( "no ARRF data after comments: " + FileName );
return 0;
}
} while ( empty_line( Buffer, input_format ) );
}
else {
while ( empty_line( Buffer, input_format ) ) {
if ( !getline( datafile, Buffer ) ) {
Warning( "no data after comments: " + FileName );
return 0;
}
}
// We found a useful line!
// Now determine the input_format (if not already known,
// and Count Features as well.
}
IF = getInputFormat( Buffer );
NumF = countFeatures( Buffer, IF );
}
}
if ( NumF > 0 ){
if ( input_format != UnknownInputFormat &&
input_format != IF ){
Warning( "assumed inputformat differs from specified!" );
return 0;
}
else {
if ( NumF > MaxFeatures ){
Error( "Number of Features exceeds the maximum number. "
"(currently " + toString(MaxFeatures) +
")\nPlease increase.\n" );
return 0;
}
setInputFormat( IF );
}
}
return NumF;
}
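// Initialize() allocates the per-feature administration once the number
// of features is known: a shared hash for all feature values, a
// separate hash for targets, the metric per feature from UserOptions
// and the reusable CurrInst. Afterwards the option table is frozen.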
void MBLClass::Initialize( size_t n ){
if ( n > 0 )
num_of_features = n;
// Allocate memory. Will be reused again and again ....
//
if ( target_pos == std::numeric_limits<size_t>::max() )
target_pos = num_of_features; // the default
else if ( target_pos > num_of_features )
FatalError( "Initialize: TARGET_POS cannot exceed NUM_OF_FEATURES+1 " +
toString( num_of_features+1 ) );
Features.resize(num_of_features,NULL);
PermFeatures.resize(num_of_features,NULL);
FeatureStrings = new Hash::StringHash(); // all features share the same hash
TargetStrings = new Hash::StringHash(); // the target has its own hash
Targets = new Target( TargetStrings );
for ( size_t i=0; i< num_of_features; ++i ){
Features[i] = new Feature( FeatureStrings );
PermFeatures[i] = NULL; //Features[i];
}
CurrInst.Init( num_of_features );
// the user thinks about features running from 1 to Num
// we know better, so shift one down.
effective_feats = num_of_features;
num_of_num_features = 0;
delete GlobalMetric;
GlobalMetric = getMetricClass( globalMetricOption );
for ( size_t j = 0; j < num_of_features; ++j ){
MetricType m = UserOptions[j+1];
if ( m == Ignore ){
Features[j]->Ignore( true );
effective_feats--;
}
else {
Features[j]->setMetricType( m );
if ( Features[j]->isNumerical() ){
num_of_num_features++;
}
}
}
Options.FreezeTable();
if ( Weighting > IG_w ||
TreeOrder >= X2Order )
need_all_weights = true;
}
} // namespace