NERsuite
1.1.1
|
00001 /* 00002 * NERSuite 00003 * Feature extraction functionalities 00004 * 00005 * Copyright (c) 00006 * All rights reserved. 00007 * 00008 * Redistribution and use in source and binary forms, with or without 00009 * modification, are permitted provided that the following conditions are met: 00010 * * Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * * Redistributions in binary form must reproduce the above copyright 00013 * notice, this list of conditions and the following disclaimer in the 00014 * documentation and/or other materials provided with the distribution. 00015 * * Neither the names of the authors nor the names of its contributors 00016 * may be used to endorse or promote products derived from this 00017 * software without specific prior written permission. 00018 * 00019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00020 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00021 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 00022 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 00023 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00024 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00025 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00026 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00027 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00028 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00029 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00030 */ 00031 00032 #ifndef _FEXTOR_H 00033 #define _FEXTOR_H 00034 00035 00036 #include <iostream> 00037 #include <string> 00038 #include <vector> 00039 #include <set> 00040 #include <algorithm> 00041 #include <cstdlib> 00042 00043 #include "typedefs.h" 00044 #include "../nersuite_common/string_utils.h" 00045 00046 00047 namespace NER 00048 { 00053 class FeatureExtractor 00054 { 00055 public: 00059 FeatureExtractor(const COLUMN_INFO& col_info); 00060 00065 void ext_WORD_feats(const V2_STR &one_sent, V2_STR &sent_feats); 00066 00071 void ext_LEMMA_feats(const V2_STR &one_sent, V2_STR &sent_feats); 00072 00077 void ext_ORTHO_feats(const V2_STR &one_sent, V2_STR &sent_feats); 00078 00083 void ext_POS_feats(const V2_STR &one_sent, V2_STR &sent_feats); 00084 00089 void ext_LEMMA_POS_feats(const V2_STR &one_sent, V2_STR &sent_feats); 00090 00095 void ext_CHUNK_feats(const V2_STR &one_sent, V2_STR &sent_feats); 00096 00102 void ext_DIC_feats(const V2_STR &one_sent, V2_STR &sent_feats, int opt_dic); 00103 00104 private: 00105 void get_n_grams(const std::string &token, const int n, std::vector<std::string> &ngrams); 00106 std::string get_item(const V2_STR &one_sent, const V2_STR_citr &i_row, const int col, const int rel_pos); 00107 void find_chunk_range(const V2_STR &one_sent, const V2_STR_citr &i_row, std::pair<V2_STR_citr, V2_STR_citr> &chk_range); 00108 00109 std::set<std::string> greek_alphabets; 00110 const COLUMN_INFO& COL_INFO; 00111 }; 00112 00113 inline FeatureExtractor::FeatureExtractor(const COLUMN_INFO& col_info) 00114 : COL_INFO(col_info) 00115 { 00116 std::string g_alphabets[] = { "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", 00117 "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega" }; 00118 greek_alphabets.insert(g_alphabets, g_alphabets + 24); 00119 } 00120 } 00121 00122 #endif 00123 00124 00125