/* $Id: Choppers.cxx 15828 2013-03-28 11:55:53Z sloot $ $URL: https://ilk.uvt.nl/svn/trunk/sources/Timbl6/src/Choppers.cxx $ Copyright (c) 1998 - 2013 ILK - Tilburg University CLiPS - University of Antwerp This file is part of timbl timbl is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. timbl is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, see . For questions and suggestions, see: http://ilk.uvt.nl/software.html or send mail to: timbl@uvt.nl */ #include #include #include #include #include #include #include "ticcutils/StringOps.h" #include "timbl/Types.h" #include "timbl/Choppers.h" using namespace std; using namespace TiCC; namespace Timbl{ Chopper *Chopper::create( InputFormatType IF, bool doEx, int fLen, bool doOcc ){ Chopper *result = 0; switch ( IF ){ case C4_5: if ( doOcc ) result = new C45_OccChopper(); else if ( doEx ) result = new C45_ExChopper(); else result = new C45_Chopper(); break; case ARFF: if ( doOcc ) result = new ARFF_OccChopper(); else if ( doEx ) result = new ARFF_ExChopper(); else result = new ARFF_Chopper(); break; case SparseBin: if ( doOcc ) result = new Bin_OccChopper(); else if ( doEx ) result = new Bin_ExChopper(); else result = new Bin_Chopper(); break; case Sparse: if ( doOcc ) result = new Sparse_OccChopper(); else if ( doEx ) result = new Sparse_ExChopper(); else result = new Sparse_Chopper(); break; case Columns: if ( doOcc ) result = new Columns_OccChopper(); else if ( doEx ) result = new Columns_ExChopper(); else result = new Columns_Chopper(); break; case Tabbed: if ( doOcc ) result = new Tabbed_OccChopper(); else if ( doEx ) result = new Tabbed_ExChopper(); else result = new Tabbed_Chopper(); break; case Compact: if ( doOcc ) result = new Compact_OccChopper( fLen ); else if ( doEx ) result = new Compact_ExChopper( fLen ); else result = new Compact_Chopper( fLen ); break; default: break; } return result; } void Chopper::init( const string& s, size_t len, bool stripDot ) { strippedInput = s; vSize = len+1; choppedInput.resize(vSize); string::iterator it = strippedInput.end(); --it; // first trim trailing spaces while ( it != strippedInput.begin() && isspace(*it) ) --it; strippedInput.erase( ++it , strippedInput.end() ); it = strippedInput.end(); --it; if ( stripDot ){ // first trim trailing dot if ( it != strippedInput.begin() && *it == '.' ) --it; } // strip remaining trailing spaces while ( it != strippedInput.begin() && isspace(*it) ) --it; strippedInput.erase( ++it , strippedInput.end() ); } static string stripExemplarWeight( const string& Buffer, string& wght ) { string::size_type t_pos, e_pos = Buffer.length(); // first remove trailing whitespace and dot e_pos = Buffer.find_last_not_of( ". \t", e_pos ); // now some non-space t_pos = Buffer.find_last_of( " \t", e_pos ); if ( t_pos != string::npos ){ // found white space wght = string( Buffer, t_pos+1, e_pos - t_pos ); } else { wght = ""; } // and some more space... e_pos = Buffer.find_last_not_of( " \t", t_pos ); return string( Buffer, 0, e_pos+1 ); } static string stripOcc( const string& Buffer, string& wght ) { return stripExemplarWeight( Buffer, wght ); } size_t Chopper::countFeatures( const string& inBuffer, InputFormatType IF, int F_length, bool chopTail ) { size_t result = 0; string buffer; if ( chopTail ){ string dummy; buffer = stripExemplarWeight( inBuffer, dummy ); } else buffer = inBuffer; size_t len = buffer.length(); switch ( IF ){ case ARFF: case C4_5: for ( size_t i = 0; i < len; ++i ) { if (buffer[i] == ',') result++; }; break; case Compact: if ( F_length == 0 ){ throw runtime_error( "-F Compact specified, but Feature Length not set." " (-l option)" ); return result; } else result = (len / F_length) - 1; break; case Columns: for ( size_t j = 0; j < len; ++j ) { if ( isspace(buffer[j]) ){ result++; while ( isspace( buffer[++j] ) ){}; if ( buffer[j] == '\0' ) result--; // we had some trailing spaces } }; break; case Tabbed: for ( size_t j = 0; j < len; ++j ) { if ( buffer[j] == '\t' ){ result++; while ( buffer[++j] == '\t' ){}; if ( buffer[j] == '\0' ) result--; // we had some trailing spaces } }; break; default: throw logic_error( "CountFeatures: Illegal value in switch:" + toString(IF) ); }; return result; } InputFormatType Chopper::getInputFormat( const string& inBuffer, bool stripTail ) { InputFormatType IF = UnknownInputFormat; string buffer; if ( stripTail ){ string dummy; buffer = stripExemplarWeight( inBuffer, dummy ); } else buffer = inBuffer; size_t len = buffer.length(); int c45Cnt = 0; int columnCnt = 0; for ( unsigned int i = 0; i < len; ++i ) { if ( buffer[i] == ',' ) { ++c45Cnt; } else if ( isspace( buffer[i] ) ){ ++columnCnt; while ( i < len && isspace( buffer[i+1] ) ) ++i; if ( i >= len-1 ){ // just trailing spaces! --columnCnt; } } } if ( columnCnt == 0 && c45Cnt == 0 ) IF = Compact; else if ( c45Cnt >= columnCnt ) IF = C4_5; else IF = Columns; return IF; } void ExChopper::init( const string& s, size_t len, bool stripDot ) { exW = -1.0; strippedInput = s; vSize = len+1; choppedInput.resize(vSize); string::iterator it = strippedInput.end(); --it; // first trim trailing spaces while ( it != strippedInput.begin() && isspace(*it) ) --it; strippedInput.erase( ++it , strippedInput.end() ); string wght; strippedInput = stripExemplarWeight( strippedInput, wght ); if ( wght.empty() ){ throw logic_error( "Missing sample weight" ); } else { double tmp; if ( !stringTo( wght, tmp ) ){ throw runtime_error( "Wrong sample weight: '" + wght + "'" ); } else { exW = tmp; } } it = strippedInput.end(); --it; if ( stripDot ){ // first trim trailing dot if ( it != strippedInput.begin() && *it == '.' ) --it; } // strip remaining trailing spaces while ( it != strippedInput.begin() && isspace(*it) ) --it; strippedInput.erase( ++it , strippedInput.end() ); } void OccChopper::init( const string& s, size_t len, bool stripDot ) { occ = 1; strippedInput = s; vSize = len+1; choppedInput.resize(vSize); string::iterator it = strippedInput.end(); --it; // first trim trailing spaces while ( it != strippedInput.begin() && isspace(*it) ) --it; strippedInput.erase( ++it , strippedInput.end() ); string occS; strippedInput = stripOcc( strippedInput, occS ); if ( occS.empty() ){ throw logic_error( "Missing occurence" ); } else { int tmp; if ( !stringTo( occS, tmp ) ){ throw runtime_error( "Wrong (non-integer) occurence value: '" + occS + "'" ); } else { occ = tmp; } } it = strippedInput.end(); --it; if ( stripDot ){ // first trim trailing dot if ( it != strippedInput.begin() && *it == '.' ) --it; } // strip remaining trailing spaces while ( it != strippedInput.begin() && isspace(*it) ) --it; strippedInput.erase( ++it , strippedInput.end() ); } bool C45_Chopper::chop( const string& InBuf, size_t len ){ // Function that takes a line, and chops it up into substrings, // which represent the feature-values and the target-value. init( InBuf, len, true ); vector splits; size_t res = TiCC::split_at( strippedInput, splits, "," ); if ( res != vSize ) return false; for ( size_t i=0; i < res ; ++i ){ choppedInput[i] = StrToCode( splits[i] ); } return true; } string C45_Chopper::getString() const{ string res; for ( size_t i = 0; i < vSize; ++i ) { res += CodeToStr( choppedInput[i] ) + ","; } return res; } bool ARFF_Chopper::chop( const string& InBuf, size_t len ){ // Lines look like this: // one, two, three , bla. // the termination dot is optional // WhiteSpace is skipped! return C45_Chopper::chop( InBuf, len ); } bool Bin_Chopper::chop( const string& InBuf, size_t len ) { // Lines look like this: // 12, 25, 333, bla. // the termination dot is optional init( InBuf, len, true ); for ( size_t m = 0; m < vSize-1; ++m ) choppedInput[m] = "0"; string::size_type s_pos = 0; string::size_type e_pos = strippedInput.find( ',' ); while ( e_pos != string::npos ){ string tmp = string( strippedInput, s_pos, e_pos - s_pos ); size_t k; if ( !stringTo( tmp, k, 1, vSize-1 ) ) return false; else choppedInput[k-1] = "1"; s_pos = e_pos + 1; e_pos = strippedInput.find( ',', s_pos ); } choppedInput[vSize-1] = string( strippedInput, s_pos ); return true; } string Bin_Chopper::getString() const { string res; for ( size_t i = 0; i < vSize-1; ++i ) { if ( choppedInput[i][0] == '1' ) res += toString(i+1) + ","; } res += choppedInput[vSize-1] + ","; return res; } bool Compact_Chopper::chop( const string& InBuf, size_t leng ){ init( InBuf, leng, false ); size_t i; // Lines look like this: // ====AKBVAK // v1v2v3v4tt // Get & add the target. // size_t len = strippedInput.length(); if ( len != vSize * fLen ){ return false; } for ( i = 0; i < vSize; ++i ) { size_t index = i * fLen; // Scan the value. // choppedInput[i] = ""; for ( int j = 0; j < fLen; ++j ) { choppedInput[i] += strippedInput[index++]; } } return ( i == vSize ); // Enough? } string Compact_Chopper::getString() const { string res; for ( size_t i = 0; i < vSize; ++i ) { res += CodeToStr( choppedInput[i] ); } return res; } bool Columns_Chopper::chop( const string& InBuf, size_t len ){ // Lines look like this: // one two three bla init( InBuf, len, false ); unsigned int i = 0; string::size_type s_pos = 0; string::size_type e_pos = strippedInput.find_first_of( " \t" ); while ( e_pos != s_pos && e_pos != string::npos && i < vSize ){ // stop if a zero length string is found or if too many entries show up choppedInput[i++] = string( strippedInput, s_pos, e_pos - s_pos ); s_pos = strippedInput.find_first_not_of( " \t", e_pos ); e_pos = strippedInput.find_first_of( " \t", s_pos ); } if ( e_pos != string::npos ) return false; if ( s_pos != string::npos && i < vSize ){ choppedInput[i++] = string( strippedInput, s_pos ); } return ( i == vSize ); // Enough? } string Columns_Chopper::getString() const { string res; for ( size_t i = 0; i < vSize; ++i ) { res += choppedInput[i] + " "; } return res; } bool Tabbed_Chopper::chop( const string& InBuf, size_t len ){ // Lines look like this: // one two three bla init( InBuf, len, false ); unsigned int i = 0; string::size_type s_pos = 0; string::size_type e_pos = strippedInput.find_first_of( "\t" ); while ( e_pos != s_pos && e_pos != string::npos && i < vSize ){ // stop if a zero length string is found or if too many entries show up choppedInput[i++] = StrToCode( string( strippedInput, s_pos, e_pos - s_pos ) ); s_pos = strippedInput.find_first_not_of( "\t", e_pos ); e_pos = strippedInput.find_first_of( "\t", s_pos ); } if ( e_pos != string::npos ) return false; if ( s_pos != string::npos && i < vSize ){ choppedInput[i++] = StrToCode( string( strippedInput, s_pos ) ); } return ( i == vSize ); // Enough? } string Tabbed_Chopper::getString() const { string res; for ( size_t i = 0; i < vSize; ++i ) { res += CodeToStr( choppedInput[i] ) + "\t"; } return res; } bool Sparse_Chopper::chop( const string& InBuf, size_t len ){ // Lines look like this: // (12,value1) (25,value2) (333,value3) bla. // the termination dot is optional init( InBuf, len, true ); for ( size_t m = 0; m < vSize-1; ++m ) choppedInput[m] = DefaultSparseString; choppedInput[vSize-1] = ""; string::size_type s_pos = strippedInput.find( "(" ); if ( s_pos == string::npos ) choppedInput[vSize-1] = TiCC::trim(strippedInput); else { string::size_type m_pos, e_pos = strippedInput.find( ")" ); while ( s_pos < e_pos && s_pos != string::npos && e_pos != string::npos ){ m_pos = strippedInput.find( ',', s_pos ); string temp = string( strippedInput, s_pos + 1, m_pos - s_pos - 1 ); size_t k = 0; if ( !stringTo( temp, k, 1, vSize-1 ) ) return false; else { choppedInput[k-1] = string( strippedInput, m_pos + 1, e_pos - m_pos -1 ); choppedInput[k-1] = StrToCode( choppedInput[k-1] ); } s_pos = strippedInput.find( '(', e_pos ); if ( s_pos == string::npos ){ e_pos = strippedInput.find_first_not_of( ") \t", e_pos ); if ( e_pos != string::npos ){ choppedInput[vSize-1] = string( strippedInput, e_pos ); choppedInput[vSize-1] = TiCC::trim( choppedInput[vSize-1] ); } } else e_pos = strippedInput.find( ')', s_pos ); } } return !choppedInput[vSize-1].empty(); } string Sparse_Chopper::getString() const { string res; for ( size_t i = 0; i < vSize-1; ++i ) { if ( choppedInput[i] != DefaultSparseString ) res += "(" + toString( i+1 ) + "," + CodeToStr(choppedInput[i]) + ")"; } res += choppedInput[vSize-1] + ","; return res; } }