NERsuite  1.1.1
src/nersuite/FExtor.h
00001 /*
00002 *      NERSuite
00003 *      Feature extraction functionalities
00004 *
00005 * Copyright (c) 
00006 * All rights reserved.
00007 *
00008 * Redistribution and use in source and binary forms, with or without
00009 * modification, are permitted provided that the following conditions are met:
00010 *     * Redistributions of source code must retain the above copyright
00011 *       notice, this list of conditions and the following disclaimer.
00012 *     * Redistributions in binary form must reproduce the above copyright
00013 *       notice, this list of conditions and the following disclaimer in the
00014 *       documentation and/or other materials provided with the distribution.
00015 *     * Neither the names of the authors nor the names of its contributors
00016 *       may be used to endorse or promote products derived from this
00017 *       software without specific prior written permission.
00018 *
00019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00020 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00021 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00022 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
00023 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00024 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00025 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00026 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00027 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00028 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00029 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00030 */
00031 
00032 #ifndef _FEXTOR_H
00033 #define _FEXTOR_H
00034 
00035 
00036 #include <iostream>
00037 #include <string>
00038 #include <vector>
00039 #include <set>
00040 #include <algorithm>
00041 #include <cstdlib>
00042 
00043 #include "typedefs.h"
00044 #include "../nersuite_common/string_utils.h"
00045 
00046 
00047 namespace NER
00048 {
00053         class FeatureExtractor
00054         {
00055         public:
00059                 FeatureExtractor(const COLUMN_INFO& col_info);
00060 
00065                 void ext_WORD_feats(const V2_STR &one_sent, V2_STR &sent_feats);
00066 
00071                 void ext_LEMMA_feats(const V2_STR &one_sent, V2_STR &sent_feats);
00072 
00077                 void ext_ORTHO_feats(const V2_STR &one_sent, V2_STR &sent_feats);
00078 
00083                 void ext_POS_feats(const V2_STR &one_sent, V2_STR &sent_feats);
00084 
00089                 void ext_LEMMA_POS_feats(const V2_STR &one_sent, V2_STR &sent_feats);
00090 
00095                 void ext_CHUNK_feats(const V2_STR &one_sent, V2_STR &sent_feats);
00096 
00102                 void ext_DIC_feats(const V2_STR &one_sent, V2_STR &sent_feats, int opt_dic);
00103 
00104         private:
00105                 void get_n_grams(const std::string &token, const int n, std::vector<std::string> &ngrams);
00106                 std::string get_item(const V2_STR &one_sent, const V2_STR_citr &i_row, const int col, const int rel_pos);
00107                 void find_chunk_range(const V2_STR &one_sent, const V2_STR_citr &i_row, std::pair<V2_STR_citr, V2_STR_citr> &chk_range);
00108 
00109                 std::set<std::string> greek_alphabets;
00110                 const COLUMN_INFO&      COL_INFO;
00111         };
00112 
00113         inline FeatureExtractor::FeatureExtractor(const COLUMN_INFO& col_info)
00114                 : COL_INFO(col_info)
00115         {
00116                 std::string g_alphabets[] = { "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", 
00117                         "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega" };
00118                 greek_alphabets.insert(g_alphabets, g_alphabets + 24);
00119         }
00120 }
00121 
00122 #endif
00123 
00124 
00125 
 All Classes Functions Variables