NERsuite
1.1.1
|
/*
 * $Id: maxent.h,v 1.1.1.1 2010/12/17 07:27:40 hccho Exp $
 */

#ifndef __MAXENT_H_
#define __MAXENT_H_

#include <string>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <iostream>
#include <string>
#include <cassert>
#include <cstring>
//#include "blmvm.h"

//#define USE_HASH_MAP  // if you encounter errors with hash, try commenting out this line.  (the program will be a bit slower, though)
#ifdef USE_HASH_MAP
#include <ext/hash_map>
#endif

//
// data format for each sample for training/testing
//
// A sample carries a string label, a list of binary (present/absent)
// feature names and a list of (name, value) real-valued features.
//
struct ME_Sample
{
public:
  // Creates a sample with an empty label.
  ME_Sample() : label("") {};
  // Creates a sample labelled `l`.
  ME_Sample(const std::string & l) : label(l) {};
  // Replaces the label of this sample.
  void set_label(const std::string & l) { label = l; }

  // to add a binary feature
  void add_feature(const std::string & f) {
    features.push_back(f);
  }

  // to add a real-valued feature
  void add_feature(const std::string & s, const double d) {
    rvfeatures.push_back(std::pair<std::string, double>(s, d));
  }

public:
  std::string label;                                        // class label of the sample
  std::vector<std::string> features;                        // binary features
  std::vector<std::pair<std::string, double> > rvfeatures;  // real-valued features

  // obsolete
  void add_feature(const std::pair<std::string, double> & f) {
    rvfeatures.push_back(f); // real-valued features
  }
};


//
// for those who want to use load_from_array()
//
// One (label, feature, weight) record of a model stored as a plain array.
// NOTE(review): how the end of the array is marked is decided by
// load_from_array(), which is implemented outside this header.
//
typedef struct ME_Model_Data
{
  char * label;
  char * feature;
  double weight;
} ME_Model_Data;


//
// Maximum entropy classifier.
//
// Only the inline members are defined here; everything else is declared
// here and implemented outside this header.
//
class ME_Model
{
public:

  // Registers one sample for a later call to train().
  void add_training_sample(const ME_Sample & s);
  // Trains on the samples registered with add_training_sample().
  // NOTE(review): the meaning of the return value and of cutoff / sigma /
  // widthfactor is defined by the implementation, not by this header.
  int train(const int cutoff = 0, const double sigma = 0, const double widthfactor = 0);
  // Classifies `s`; presumably returns one score per class and may update
  // `s` (it is taken by non-const reference) -- confirm in the implementation.
  std::vector<double> classify(ME_Sample & s) const;
  bool load_from_file(const std::string & filename);
  bool save_to_file(const std::string & filename) const;
  // Number of classes currently known to the model (0 for a fresh model).
  int num_classes() const { return _num_classes; }
  // Label string of class id `i`; asserts that `i` is a valid id.
  std::string get_class_label(int i) const { return _label_bag.Str(i); }
  // Class id of label `s`, or -1 if the label is unknown.
  int get_class_id(const std::string & s) const { return _label_bag.Id(s); }
  void get_features(std::list< std::pair< std::pair<std::string, std::string>, double> > & fl);
  // Stores the held-out sample count `h` and the early-stopping parameter `n`.
  void set_heldout(const int h, const int n = 0) { _nheldout = h; _early_stopping_n = n; };
  bool load_from_array(const ME_Model_Data data[]);
  // Stores a pointer to `ref_model` (no copy is made), so `ref_model` must
  // outlive every later use of this model that consults it.
  void set_reference_model(const ME_Model & ref_model) { _ref_modelp = &ref_model; };

  // Every scalar member is given a defined value so that the inline getters
  // (e.g. num_classes()) are safe to call on a freshly constructed model.
  // The initializers are listed in member declaration order.
  ME_Model()
    : _sigma(0),
      _inequality_width(0),
      _num_classes(0),
      _train_error(0),
      _heldout_error(0),
      _nheldout(0),
      _early_stopping_n(0),
      _ref_modelp(NULL)
  {
  }

public:
  // obsolete. just for downward compatibility
  int train(const std::vector<ME_Sample> & train,
            const int cutoff = 0, const double sigma = 0, const double widthfactor = 0);

private:

  // Internal sample representation: labels and features are integer ids.
  struct Sample {
    int label;
    std::vector<int> positive_features;
    std::vector<std::pair<int, double> > rvfeatures;
    std::vector<double> ref_pd; // reference probability distribution

    // Lexicographic order on positive_features.
    //
    // A proper prefix orders before the longer list. The previous hand-written
    // loop returned false in both directions for a list and its prefix; that
    // "equivalence" is not transitive, so the comparison was not a strict weak
    // ordering as required by std::sort.
    bool operator<(const Sample & x) const {
      return positive_features < x.positive_features;
    }
  };

  // A (label id, feature id) pair packed into one unsigned int:
  // the feature id occupies the upper 24 bits, the label id the low 8 bits.
  struct ME_Feature
  {
    enum { MAX_LABEL_TYPES = 255 };

    //    ME_Feature(const int l, const int f) : _body((l << 24) + f) {
    //      assert(l >= 0 && l < 256);
    //      assert(f >= 0 && f <= 0xffffff);
    //    };
    //    int label() const { return _body >> 24; }
    //    int feature() const { return _body & 0xffffff; }

    // The shift is done on unsigned int: f may be as large as 0xffffff and
    // shifting that left by 8 does not fit in a signed int.
    ME_Feature(const int l, const int f)
      : _body((static_cast<unsigned int>(f) << 8) + static_cast<unsigned int>(l)) {
      assert(l >= 0 && l <= MAX_LABEL_TYPES);
      assert(f >= 0 && f <= 0xffffff);
    };
    int label() const { return _body & 0xff; }
    int feature() const { return _body >> 8; }
    unsigned int body() const { return _body; }
  private:
    unsigned int _body;
  };

  // Bidirectional mapping between ME_Feature values and dense integer ids
  // (ids are handed out in insertion order, starting at 0).
  struct ME_FeatureBag
  {
#ifdef USE_HASH_MAP
    typedef __gnu_cxx::hash_map<unsigned int, int> map_type;
#else
    typedef std::map<unsigned int, int> map_type;
#endif
    map_type mef2id;
    std::vector<ME_Feature> id2mef;

    // Returns the id of `i`, registering it first if it is new.
    int Put(const ME_Feature & i) {
      const map_type::const_iterator found = mef2id.find(i.body());
      if (found != mef2id.end()) {
        return found->second;
      }
      const int id = static_cast<int>(id2mef.size());
      id2mef.push_back(i);
      mef2id[i.body()] = id;
      return id;
    }
    // Returns the id of `i`, or -1 if it has not been registered.
    int Id(const ME_Feature & i) const {
      const map_type::const_iterator found = mef2id.find(i.body());
      if (found == mef2id.end()) {
        return -1;
      }
      return found->second;
    }
    // Returns the feature registered under `id`; asserts that `id` is valid.
    ME_Feature Feature(int id) const {
      assert(id >= 0 && id < (int)id2mef.size());
      return id2mef[id];
    }
    int Size() const {
      return static_cast<int>(id2mef.size());
    }
    void Clear() {
      mef2id.clear();
      id2mef.clear();
    }
  };

  // String hash used when USE_HASH_MAP is defined.
  struct hashfun_str
  {
    size_t operator()(const std::string& s) const {
      assert(sizeof(int) == 4 && sizeof(char) == 1);
      const char* p = s.data();
      size_t v = 0;
      const int n = static_cast<int>(s.size() / 4);
      for (int i = 0; i < n; i++, p += 4) {
        // Copy each 4-byte group into an int instead of dereferencing the
        // character buffer through an int pointer (aliasing/alignment).
        int word;
        std::memcpy(&word, p, sizeof(word));
        //      v ^= word;
        // Shift as unsigned, then go back through int so the widening to
        // size_t matches a plain `int << k`.
        const unsigned int shifted = static_cast<unsigned int>(word) << (4 * (i % 2)); // note) 0 <= char < 128
        v ^= static_cast<int>(shifted);
      }
      // Fold in the 0..3 trailing characters one byte at a time.
      const int m = static_cast<int>(s.size() % 4);
      for (int i = 0; i < m; i++) {
        v ^= s[4 * n + i] << (i * 8);
      }
      return v;
    }
  };

  // One-way mapping from strings to dense integer ids.
  struct MiniStringBag
  {
#ifdef USE_HASH_MAP
    typedef __gnu_cxx::hash_map<std::string, int, hashfun_str> map_type;
#else
    typedef std::map<std::string, int> map_type;
#endif
    int _size;
    map_type str2id;
    MiniStringBag() : _size(0) {}
    // Returns the id of `i`, registering it first if it is new.
    int Put(const std::string & i) {
      const map_type::const_iterator found = str2id.find(i);
      if (found != str2id.end()) {
        return found->second;
      }
      const int id = _size;
      _size++;
      str2id[i] = id;
      return id;
    }
    // Returns the id of `i`, or -1 if it has not been registered.
    int Id(const std::string & i) const {
      const map_type::const_iterator found = str2id.find(i);
      if (found == str2id.end()) return -1;
      return found->second;
    }
    int Size() const { return _size; }
    void Clear() { str2id.clear(); _size = 0; }
    map_type::const_iterator begin() const { return str2id.begin(); }
    map_type::const_iterator end()   const { return str2id.end(); }
  };

  // MiniStringBag that also remembers the id -> string direction.
  // Put(), Size() and Clear() hide (not override) the base versions and use
  // id2str.size() as the element count; the inherited _size is not maintained.
  struct StringBag : public MiniStringBag
  {
    std::vector<std::string> id2str;
    // Returns the id of `i`, registering it first if it is new.
    int Put(const std::string & i) {
      const map_type::const_iterator found = str2id.find(i);
      if (found != str2id.end()) {
        return found->second;
      }
      const int id = static_cast<int>(id2str.size());
      id2str.push_back(i);
      str2id[i] = id;
      return id;
    }
    // Returns the string registered under `id`; asserts that `id` is valid.
    std::string Str(const int id) const {
      assert(id >= 0 && id < (int)id2str.size());
      return id2str[id];
    }
    int Size() const { return static_cast<int>(id2str.size()); }
    void Clear() {
      str2id.clear();
      id2str.clear();
    }
  };

  std::vector<Sample> _vs; // vector of training_samples
  StringBag _label_bag;
  MiniStringBag _featurename_bag;
  double _sigma; // Gaussian prior
  double _inequality_width;
  std::vector<double> _vl; // vector of lambda
  std::vector<double> _va; // vector of alpha (for inequality ME)
  std::vector<double> _vb; // vector of beta  (for inequality ME)
  ME_FeatureBag _fb;
  int _num_classes;
  std::vector<double> _vee;  // empirical expectation
  std::vector<double> _vme;  // presumably model expectation (this comment used to repeat "empirical expectation")
  std::vector< std::vector< int > > _feature2mef;
  std::vector< Sample > _heldout;
  double _train_error;   // current error rate on the training data
  double _heldout_error; // current error rate on the heldout data
  int _nheldout;
  int _early_stopping_n;
  std::vector<double> _vhlogl;
  const ME_Model * _ref_modelp; // not owned; set by set_reference_model()

  double heldout_likelihood();
  int conditional_probability(const Sample & nbs, std::vector<double> & membp) const;
  int make_feature_bag(const int cutoff);
  int classify(const Sample & nbs, std::vector<double> & membp) const;
  double update_model_expectation();
  int perform_LMVM();
  int perform_GIS(int C);
  void set_ref_dist(Sample & s) const;
  void init_feature2mef();

  // BLMVM
  /*
  int BLMVMComputeFunctionGradient(BLMVM blmvm, BLMVMVec X,double *f,BLMVMVec G);
  int BLMVMComputeBounds(BLMVM blmvm, BLMVMVec XL, BLMVMVec XU);
  int BLMVMSolve(double *x, int n);
  int BLMVMFunctionGradient(double *x, double *f, double *g, int n);
  int BLMVMLowerAndUpperBounds(double *xl,double *xu,int n);
  int Solve_BLMVM(BLMVM blmvm, BLMVMVec X);
  */
};


#endif


/*
 * $Log: maxent.h,v $
 * Revision 1.1.1.1  2010/12/17 07:27:40  hccho
 *
 *
 * Revision 1.24  2006/08/21 17:30:38  tsuruoka
 * use MAX_LABEL_TYPES
 *
 * Revision 1.23  2006/07/25 13:19:53  tsuruoka
 * sort _vs[]
 *
 * Revision 1.22  2006/07/18 11:13:15  tsuruoka
 * modify comments
 *
 * Revision 1.21  2006/07/18 10:02:15  tsuruoka
 * remove sample2feature[]
 * speed up conditional_probability()
 *
 * Revision 1.20  2006/07/18 05:10:51  tsuruoka
 * add ref_dist
 *
 * Revision 1.19  2005/12/23 10:33:02  tsuruoka
 * support real-valued features
 *
 * Revision 1.18  2005/12/23 09:15:29  tsuruoka
 * modify _train to reduce memory consumption
 *
 * Revision 1.17  2005/10/28 13:02:34  tsuruoka
 * set_heldout(): add default value
 * Feature()
 *
 * Revision 1.16  2005/09/12 13:51:16  tsuruoka
 * Sample: list -> vector
 *
 * Revision 1.15  2005/09/12 13:27:10  tsuruoka
 * add add_training_sample()
 *
 * Revision 1.14  2005/04/27 11:22:27  tsuruoka
 * bugfix
 * ME_Sample: list -> vector
 *
 * Revision 1.13  2005/04/27 10:20:19  tsuruoka
 * MiniStringBag -> StringBag
 *
 * Revision 1.12  2005/04/27 10:00:42  tsuruoka
 * remove tmpfb
 *
 * Revision 1.11  2005/04/26 14:25:53  tsuruoka
 * add MiniStringBag, USE_HASH_MAP
 *
 * Revision 1.10  2004/10/04 05:50:25  tsuruoka
 * add Clear()
 *
 * Revision 1.9  2004/08/09 12:27:21  tsuruoka
 * change messages
 *
 * Revision 1.8  2004/08/04 13:55:19  tsuruoka
 * modify _sample2feature
 *
 * Revision 1.7  2004/07/29 05:51:13  tsuruoka
 * remove modeldata.h
 *
 * Revision 1.6  2004/07/28 13:42:58  tsuruoka
 * add AGIS
 *
 * Revision 1.5  2004/07/28 05:54:14  tsuruoka
 * get_class_name() -> get_class_label()
 * ME_Feature: bugfix
 *
 * Revision 1.4  2004/07/27 16:58:47  tsuruoka
 * modify the interface of classify()
 *
 * Revision 1.3  2004/07/26 17:23:46  tsuruoka
 * _sample2feature: list -> vector
 *
 * Revision 1.2  2004/07/26 15:49:23  tsuruoka
 * modify ME_Feature
 *
 * Revision 1.1  2004/07/26 13:10:55  tsuruoka
 * add files
 *
 * Revision 1.18  2004/07/22 08:34:45  tsuruoka
 * modify _sample2feature[]
 *
 * Revision 1.17  2004/07/21 16:33:01  tsuruoka
 * remove some comments
 *
 */