NERsuite  1.1.1
src/gtagger/geniatagger-3.0.1/maxent.h
00001 /*
00002  * $Id: maxent.h,v 1.1.1.1 2010/12/17 07:27:40 hccho Exp $
00003  */
00004 
00005 #ifndef __MAXENT_H_
00006 #define __MAXENT_H_
00007 
00008 #include <string>
00009 #include <vector>
00010 #include <list>
00011 #include <map>
00012 #include <algorithm>
00013 #include <iostream>
00014 #include <string>
00015 #include <cassert>
00016 //#include "blmvm.h"
00017 
00018 //#define USE_HASH_MAP  // if you encounter errors with hash, try commenting out this line. (the program will be a bit slower, though)
00019 #ifdef USE_HASH_MAP
00020 #include <ext/hash_map>
00021 #endif
00022 
00023 //
00024 // data format for each sample for training/testing
00025 //
struct ME_Sample
{
  // Class label of this sample; empty until one is assigned.
  std::string label;
  // Binary features: one entry per call to add_feature(name).
  std::vector<std::string> features;
  // Real-valued features, stored as (name, value) pairs.
  std::vector<std::pair<std::string, double> > rvfeatures;

  // Creates a sample with an empty label.
  ME_Sample() : label() {}

  // Creates a sample whose label is l.
  ME_Sample(const std::string & l) : label(l) {}

  // Replaces the label with l.
  void set_label(const std::string & l) {
    label = l;
  }

  // Appends the binary feature f.
  void add_feature(const std::string & f) {
    features.push_back(f);
  }

  // Appends the real-valued feature named s with value d.
  void add_feature(const std::string & s, const double d) {
    rvfeatures.push_back(std::make_pair(s, d));
  }

  // Obsolete: appends an already-built (name, value) real-valued feature.
  void add_feature(const std::pair<std::string, double> & f) {
    rvfeatures.push_back(f);
  }
};
00053 
00054 
00055 //
00056 // for those who want to use load_from_array()
00057 //
// One record of the array accepted by ME_Model::load_from_array().
// NOTE(review): presumably each record carries the weight of one
// (label, feature) pair -- confirm against the load_from_array() definition.
struct ME_Model_Data
{
  char * label;    // label name (C string)
  char * feature;  // feature name (C string)
  double weight;   // weight stored with this record
};
00064 
00065 
00066 class ME_Model
00067 {
00068 public:
00069 
00070   void add_training_sample(const ME_Sample & s);
00071   int train(const int cutoff = 0, const double sigma = 0, const double widthfactor = 0);
00072   std::vector<double> classify(ME_Sample & s) const;
00073   bool load_from_file(const std::string & filename);
00074   bool save_to_file(const std::string & filename) const;
00075   int num_classes() const { return _num_classes; }
00076   std::string get_class_label(int i) const { return _label_bag.Str(i); }
00077   int get_class_id(const std::string & s) const { return _label_bag.Id(s); }
00078   void get_features(std::list< std::pair< std::pair<std::string, std::string>, double> > & fl);
00079   void set_heldout(const int h, const int n = 0) { _nheldout = h; _early_stopping_n = n; };
00080   bool load_from_array(const ME_Model_Data data[]);
00081   void set_reference_model(const ME_Model & ref_model) { _ref_modelp = &ref_model; };
00082 
00083   ME_Model() {
00084     _nheldout = 0;
00085     _early_stopping_n = 0;
00086     _ref_modelp = NULL;
00087   }
00088 
00089 public:
00090   // obsolete. just for downward compatibility
00091   int train(const std::vector<ME_Sample> & train,
00092             const int cutoff = 0, const double sigma = 0, const double widthfactor = 0);
00093 
00094 private:  
00095   
00096   struct Sample {
00097     int label;
00098     std::vector<int> positive_features;
00099     std::vector<std::pair<int, double> > rvfeatures;
00100     std::vector<double> ref_pd; // reference probability distribution
00101     bool operator<(const Sample & x) const {
00102       for (int i = 0; i < positive_features.size(); i++) {
00103         if (i >= x.positive_features.size()) return false;
00104         int v0 = positive_features[i];
00105         int v1 = x.positive_features[i];
00106         if (v0 < v1) return true;
00107         if (v0 > v1) return false;
00108       }
00109       return false;
00110     }
00111   };
00112 
00113   struct ME_Feature
00114   {
00115     enum { MAX_LABEL_TYPES = 255 };
00116       
00117     //    ME_Feature(const int l, const int f) : _body((l << 24) + f) {
00118     //      assert(l >= 0 && l < 256);
00119     //      assert(f >= 0 && f <= 0xffffff);
00120     //    };
00121     //    int label() const { return _body >> 24; }
00122     //    int feature() const { return _body & 0xffffff; }
00123     ME_Feature(const int l, const int f) : _body((f << 8) + l) {
00124       assert(l >= 0 && l <= MAX_LABEL_TYPES);
00125       assert(f >= 0 && f <= 0xffffff);
00126     };
00127     int label() const { return _body & 0xff; }
00128     int feature() const { return _body >> 8; }
00129     unsigned int body() const { return _body; }
00130   private:
00131     unsigned int _body;
00132   };
00133 
00134   struct ME_FeatureBag
00135   {
00136 #ifdef USE_HASH_MAP
00137     typedef __gnu_cxx::hash_map<unsigned int, int> map_type;
00138 #else    
00139     typedef std::map<unsigned int, int> map_type;
00140 #endif
00141     map_type mef2id;
00142     std::vector<ME_Feature> id2mef;
00143     int Put(const ME_Feature & i) {
00144       map_type::const_iterator j = mef2id.find(i.body());
00145       if (j == mef2id.end()) {
00146         int id = id2mef.size();
00147         id2mef.push_back(i);
00148         mef2id[i.body()] = id;
00149         return id;
00150       }
00151       return j->second;
00152     }
00153     int Id(const ME_Feature & i) const {
00154       map_type::const_iterator j = mef2id.find(i.body());
00155       if (j == mef2id.end()) {
00156         return -1;
00157       }
00158       return j->second;
00159     }
00160     ME_Feature Feature(int id) const {
00161       assert(id >= 0 && id < (int)id2mef.size());
00162       return id2mef[id];
00163     }
00164     int Size() const {
00165       return id2mef.size();
00166     }
00167     void Clear() {
00168       mef2id.clear();
00169       id2mef.clear();
00170     }
00171   };
00172 
00173   struct hashfun_str
00174   {
00175     size_t operator()(const std::string& s) const {
00176       assert(sizeof(int) == 4 && sizeof(char) == 1);
00177       const int* p = reinterpret_cast<const int*>(s.c_str());
00178       size_t v = 0;
00179       int n = s.size() / 4;
00180       for (int i = 0; i < n; i++, p++) {
00181         //      v ^= *p;
00182         v ^= *p << (4 * (i % 2)); // note) 0 <= char < 128
00183       }
00184       int m = s.size() % 4;
00185       for (int i = 0; i < m; i++) {
00186         v ^= s[4 * n + i] << (i * 8);
00187       }
00188       return v;
00189     }
00190   };
00191 
00192   struct MiniStringBag
00193   {
00194 #ifdef USE_HASH_MAP
00195     typedef __gnu_cxx::hash_map<std::string, int, hashfun_str> map_type;
00196 #else    
00197     typedef std::map<std::string, int> map_type;
00198 #endif
00199     int _size;
00200     map_type str2id;
00201     MiniStringBag() : _size(0) {}
00202     int Put(const std::string & i) {
00203       map_type::const_iterator j = str2id.find(i);
00204       if (j == str2id.end()) {
00205         int id = _size;
00206         _size++;
00207         str2id[i] = id;
00208         return id;
00209       }
00210       return j->second;
00211     }
00212     int Id(const std::string & i) const {
00213       map_type::const_iterator j = str2id.find(i);
00214       if (j == str2id.end())  return -1;
00215       return j->second;
00216     }
00217     int Size() const { return _size; }
00218     void Clear() { str2id.clear(); _size = 0; }
00219     map_type::const_iterator begin() const { return str2id.begin(); }
00220     map_type::const_iterator end()   const { return str2id.end(); }
00221   };
00222 
00223   struct StringBag : public MiniStringBag
00224   {
00225     std::vector<std::string> id2str;
00226     int Put(const std::string & i) {
00227       map_type::const_iterator j = str2id.find(i);
00228       if (j == str2id.end()) {
00229         int id = id2str.size();
00230         id2str.push_back(i);
00231         str2id[i] = id;
00232         return id;
00233       }
00234       return j->second;
00235     }
00236     std::string Str(const int id) const {
00237       assert(id >= 0 && id < (int)id2str.size());
00238       return id2str[id];
00239     }
00240     int Size() const { return id2str.size(); }
00241     void Clear() {
00242       str2id.clear();
00243       id2str.clear();
00244     }
00245   };
00246 
00247   std::vector<Sample> _vs; // vector of training_samples
00248   StringBag _label_bag;
00249   MiniStringBag _featurename_bag;
00250   double _sigma; // Gaussian prior
00251   double _inequality_width;
00252   std::vector<double> _vl;  // vector of lambda
00253   std::vector<double> _va;  // vector of alpha (for inequality ME)
00254   std::vector<double> _vb;  // vector of beta  (for inequality ME)
00255   ME_FeatureBag _fb;
00256   int _num_classes;
00257   std::vector<double> _vee;  // empirical expectation
00258   std::vector<double> _vme;  // empirical expectation
00259   std::vector< std::vector< int > > _feature2mef;
00260   std::vector< Sample > _heldout;
00261   double _train_error;   // current error rate on the training data
00262   double _heldout_error; // current error rate on the heldout data
00263   int _nheldout;
00264   int _early_stopping_n;
00265   std::vector<double> _vhlogl;
00266   const ME_Model * _ref_modelp;
00267 
00268   double heldout_likelihood();
00269   int conditional_probability(const Sample & nbs, std::vector<double> & membp) const;
00270   int make_feature_bag(const int cutoff);
00271   int classify(const Sample & nbs, std::vector<double> & membp) const;
00272   double update_model_expectation();
00273   int perform_LMVM();
00274   int perform_GIS(int C);
00275   void set_ref_dist(Sample & s) const;
00276   void init_feature2mef();
00277 
00278   // BLMVM
00279   /*
00280   int BLMVMComputeFunctionGradient(BLMVM blmvm, BLMVMVec X,double *f,BLMVMVec G);
00281   int BLMVMComputeBounds(BLMVM blmvm, BLMVMVec XL, BLMVMVec XU);
00282   int BLMVMSolve(double *x, int n);
00283   int BLMVMFunctionGradient(double *x, double *f, double *g, int n);
00284   int BLMVMLowerAndUpperBounds(double *xl,double *xu,int n);
00285   int Solve_BLMVM(BLMVM blmvm, BLMVMVec X);
00286   */
00287 };
00288 
00289 
00290 #endif
00291 
00292 
00293 /*
00294  * $Log: maxent.h,v $
00295  * Revision 1.1.1.1  2010/12/17 07:27:40  hccho
00296  *
00297  *
00298  * Revision 1.24  2006/08/21 17:30:38  tsuruoka
00299  * use MAX_LABEL_TYPES
00300  *
00301  * Revision 1.23  2006/07/25 13:19:53  tsuruoka
00302  * sort _vs[]
00303  *
00304  * Revision 1.22  2006/07/18 11:13:15  tsuruoka
00305  * modify comments
00306  *
00307  * Revision 1.21  2006/07/18 10:02:15  tsuruoka
00308  * remove sample2feature[]
00309  * speed up conditional_probability()
00310  *
00311  * Revision 1.20  2006/07/18 05:10:51  tsuruoka
00312  * add ref_dist
00313  *
00314  * Revision 1.19  2005/12/23 10:33:02  tsuruoka
00315  * support real-valued features
00316  *
00317  * Revision 1.18  2005/12/23 09:15:29  tsuruoka
00318  * modify _train to reduce memory consumption
00319  *
00320  * Revision 1.17  2005/10/28 13:02:34  tsuruoka
00321  * set_heldout(): add default value
00322  * Feature()
00323  *
00324  * Revision 1.16  2005/09/12 13:51:16  tsuruoka
00325  * Sample: list -> vector
00326  *
00327  * Revision 1.15  2005/09/12 13:27:10  tsuruoka
00328  * add add_training_sample()
00329  *
00330  * Revision 1.14  2005/04/27 11:22:27  tsuruoka
00331  * bugfix
00332  * ME_Sample: list -> vector
00333  *
00334  * Revision 1.13  2005/04/27 10:20:19  tsuruoka
00335  * MiniStringBag -> StringBag
00336  *
00337  * Revision 1.12  2005/04/27 10:00:42  tsuruoka
00338  * remove tmpfb
00339  *
00340  * Revision 1.11  2005/04/26 14:25:53  tsuruoka
00341  * add MiniStringBag, USE_HASH_MAP
00342  *
00343  * Revision 1.10  2004/10/04 05:50:25  tsuruoka
00344  * add Clear()
00345  *
00346  * Revision 1.9  2004/08/09 12:27:21  tsuruoka
00347  * change messages
00348  *
00349  * Revision 1.8  2004/08/04 13:55:19  tsuruoka
00350  * modify _sample2feature
00351  *
00352  * Revision 1.7  2004/07/29 05:51:13  tsuruoka
00353  * remove modeldata.h
00354  *
00355  * Revision 1.6  2004/07/28 13:42:58  tsuruoka
00356  * add AGIS
00357  *
00358  * Revision 1.5  2004/07/28 05:54:14  tsuruoka
00359  * get_class_name() -> get_class_label()
00360  * ME_Feature: bugfix
00361  *
00362  * Revision 1.4  2004/07/27 16:58:47  tsuruoka
00363  * modify the interface of classify()
00364  *
00365  * Revision 1.3  2004/07/26 17:23:46  tsuruoka
00366  * _sample2feature: list -> vector
00367  *
00368  * Revision 1.2  2004/07/26 15:49:23  tsuruoka
00369  * modify ME_Feature
00370  *
00371  * Revision 1.1  2004/07/26 13:10:55  tsuruoka
00372  * add files
00373  *
00374  * Revision 1.18  2004/07/22 08:34:45  tsuruoka
00375  * modify _sample2feature[]
00376  *
00377  * Revision 1.17  2004/07/21 16:33:01  tsuruoka
00378  * remove some comments
00379  *
00380  */
 All Classes Functions Variables