NERsuite  1.1.1
src/dictionary_tagger/sentence_tagger.h
00001 /*
00002 *      SentenceTagger matcher
00003 *
00004 * Copyright (c) 
00005 * All rights reserved.
00006 *
00007 * Redistribution and use in source and binary forms, with or without
00008 * modification, are permitted provided that the following conditions are met:
00009 *     * Redistributions of source code must retain the above copyright
00010 *       notice, this list of conditions and the following disclaimer.
00011 *     * Redistributions in binary form must reproduce the above copyright
00012 *       notice, this list of conditions and the following disclaimer in the
00013 *       documentation and/or other materials provided with the distribution.
00014 *     * Neither the names of the authors nor the names of its contributors
00015 *       may be used to endorse or promote products derived from this
00016 *       software without specific prior written permission.
00017 *
00018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00019 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00020 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00021 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
00022 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00023 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00024 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00025 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00026 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00027 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029 */
00030 #ifndef _SENTENCE_TAGGER_H
00031 #define _SENTENCE_TAGGER_H
00032 
00033 #include <vector>
00034 #include <string>
00035 #include <iostream>
00036 #include "../nersuite_common/dictionary.h"
00037 #include "../nersuite_common/ne.h"
00038 
00039 // Predefined column info.
      // Integer constants naming fixed columns; presumably zero-based indices
      // into one tokenized row (a vector of strings) of the sentence data --
      // TODO confirm against sentence_tagger.cpp.
      // NOTE(review): no macro is defined for column 3 in this header.
      // BEG_COL / END_COL: presumably the begin and end offsets of a token -- confirm.
00040 #define         BEG_COL                 0
00041 #define         END_COL                 1
      // RAW_TOKEN_COL: presumably the surface form of the token -- confirm.
00042 #define         RAW_TOKEN_COL   2
      // POS_COL: presumably the part-of-speech tag of the token -- confirm.
00043 #define         POS_COL                 4
00044 
00045 namespace NER
00046 {
00062         class SentenceTagger
00063         {
              // Holds one sentence as a table of strings (m_Content) and exposes
              // read() to fill it from a stream and tag_nes() to process it with
              // a Dictionary. Only declarations are visible in this header; the
              // behaviour of the non-inline members lives in the implementation
              // file and is not described here beyond what the signatures show.
00064         private:
                      // Two-level table of strings: the outer vector is indexed by
                      // row, each row is itself a vector of strings.
00065                 typedef std::vector< std::vector<std::string> > V2_STR;
00066 
                      // Class-wide (static) settings shared by every SentenceTagger.
                      // Their definitions and initial values are not in this header.
                      // NOTE(review): max_ne_len is presumably an upper bound on the
                      // token length of a candidate named entity, and filter_NN
                      // presumably toggles a part-of-speech based filter -- confirm
                      // both in the implementation file.
00067                 static size_t   max_ne_len;
00068                 static bool             filter_NN;
                      // Written by set_normalize_type(); the meaning of the integer
                      // codes is not defined in this header.
00069                 static int              normalize_type;
00070 
00071                 // Sentence Data (Tokenized array)
00072                 V2_STR  m_Content;
00073 
                      // Collection of NE objects (NE is declared outside this header).
                      // Presumably the entities found while tagging -- TODO confirm.
00074                 std::vector<NE>         v_ne;
00075                 
                      // Integer index list; its exact role is not visible here.
                      // NOTE(review): confirm how it relates to v_ne / m_Content.
00076                 std::vector<int>        v_idx;
00077 
00078         public:
                      // Default constructor (defined in the implementation file).
00082                 SentenceTagger();
00083 
                      // Virtual destructor with an empty body, so deleting through a
                      // base-class pointer is well defined if the class is derived from.
00087                 virtual ~SentenceTagger() {}
00088 
                      // Stores nt in the static normalize_type. Because the member is
                      // static, the new value applies to all SentenceTagger instances.
00093                 static void set_normalize_type(int nt) { normalize_type = nt; }
00094 
                      // Number of rows currently held in m_Content.
00099                 size_t  size() const { return m_Content.size(); }
00100 
                      // True when m_Content holds no rows.
00106                 bool    empty() const { return m_Content.empty(); }
00107 
                      // Mutable iterator to the first row of m_Content.
                      // No const overload is declared.
00111                 V2_STR::iterator        begin() { return m_Content.begin(); }
00112 
                      // Mutable past-the-end iterator of m_Content.
00116                 V2_STR::iterator        end() { return m_Content.end(); }
00117 
                      // Mutable reference to row `index` of m_Content. Forwards directly
                      // to std::vector::operator[], so no bounds check is performed here.
                      // NOTE(review): V1_STR is not declared in this header; it must be
                      // provided by one of the included headers and be compatible with
                      // std::vector<std::string> -- confirm.
00121                 V1_STR& operator[](size_t index) { return m_Content[index]; }
00122 
                      // Reads sentence data from the input stream `ifs` and returns a
                      // size_t. Declared only; presumably the return value is the number
                      // of rows read -- TODO confirm in the implementation file.
00127                 size_t  read(std::istream &ifs);
00128 
                      // Tags named entities in the held sentence using `dict`, which is
                      // taken by const reference. Declared only; see the implementation
                      // file for the actual procedure.
00133                 void    tag_nes(const Dictionary& dict);
00134 
00135         private:
                      // Private helpers, declared only. The descriptions below are
                      // inferred from names and signatures -- confirm before relying
                      // on them.
                      // resolve_collision: presumably reconciles overlapping entries
                      // among the collected NEs.
00136                 void    resolve_collision();
00137 
                      // mark_ne: presumably records dictionary matches for the sentence.
00138                 void    mark_ne(const Dictionary& dict);
00139 
                      // find_longest / find_exact: look up a match starting at row
                      // i_row using `dict`; `ne` is a non-const reference and so may be
                      // written to. The meaning of the int return value is not visible
                      // here. Both are const and do not modify non-mutable instance state.
00140                 int             find_longest(size_t i_row, NE& ne, const Dictionary& dict) const;
00141 
00142                 int             find_exact(size_t i_row, NE& ne, const Dictionary& dict) const;
00143 
                      // find_min_length: returns a size_t computed for row i_row;
                      // presumably a minimum match length -- TODO confirm.
00144                 size_t  find_min_length(size_t i_row) const;
00145         };
00146 }
00147 
00148 #endif
 All Classes Functions Variables