NERsuite
1.1.1
|
00001 /* 00002 * SentenceTagger matcher 00003 * 00004 * Copyright (c) 00005 * All rights reserved. 00006 * 00007 * Redistribution and use in source and binary forms, with or without 00008 * modification, are permitted provided that the following conditions are met: 00009 * * Redistributions of source code must retain the above copyright 00010 * notice, this list of conditions and the following disclaimer. 00011 * * Redistributions in binary form must reproduce the above copyright 00012 * notice, this list of conditions and the following disclaimer in the 00013 * documentation and/or other materials provided with the distribution. 00014 * * Neither the names of the authors nor the names of its contributors 00015 * may be used to endorse or promote products derived from this 00016 * software without specific prior written permission. 00017 * 00018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00019 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00020 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 00021 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 00022 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00023 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00024 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00025 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00026 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00027 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00029 */ 00030 #ifndef _SENTENCE_TAGGER_H 00031 #define _SENTENCE_TAGGER_H 00032 00033 #include <vector> 00034 #include <string> 00035 #include <iostream> 00036 #include "../nersuite_common/dictionary.h" 00037 #include "../nersuite_common/ne.h" 00038 00039 // Predefined column info. 00040 #define BEG_COL 0 00041 #define END_COL 1 00042 #define RAW_TOKEN_COL 2 00043 #define POS_COL 4 00044 00045 namespace NER 00046 { 00062 class SentenceTagger 00063 { 00064 private: 00065 typedef std::vector< std::vector<std::string> > V2_STR; 00066 00067 static size_t max_ne_len; 00068 static bool filter_NN; 00069 static int normalize_type; 00070 00071 // Sentence Data (Tokenized array) 00072 V2_STR m_Content; 00073 00074 std::vector<NE> v_ne; 00075 00076 std::vector<int> v_idx; 00077 00078 public: 00082 SentenceTagger(); 00083 00087 virtual ~SentenceTagger() {} 00088 00093 static void set_normalize_type(int nt) { normalize_type = nt; } 00094 00099 size_t size() const { return m_Content.size(); } 00100 00106 bool empty() const { return m_Content.empty(); } 00107 00111 V2_STR::iterator begin() { return m_Content.begin(); } 00112 00116 V2_STR::iterator end() { return m_Content.end(); } 00117 00121 V1_STR& operator[](size_t index) { return m_Content[index]; } 00122 00127 size_t read(std::istream &ifs); 00128 00133 void tag_nes(const Dictionary& dict); 00134 00135 private: 00136 void resolve_collision(); 00137 00138 void mark_ne(const Dictionary& dict); 00139 00140 int find_longest(size_t i_row, NE& ne, const Dictionary& dict) const; 00141 00142 int find_exact(size_t i_row, NE& ne, const Dictionary& dict) const; 00143 00144 size_t find_min_length(size_t i_row) const; 00145 }; 00146 } 00147 00148 #endif