// NERsuite 1.1.1
00001 /* 00002 * dictionary library 00003 * 00004 * Copyright (c) 00005 * All rights reserved. 00006 * 00007 * Redistribution and use in source and binary forms, with or without 00008 * modification, are permitted provided that the following conditions are met: 00009 * * Redistributions of source code must retain the above copyright 00010 * notice, this list of conditions and the following disclaimer. 00011 * * Redistributions in binary form must reproduce the above copyright 00012 * notice, this list of conditions and the following disclaimer in the 00013 * documentation and/or other materials provided with the distribution. 00014 * * Neither the names of the authors nor the names of its contributors 00015 * may be used to endorse or promote products derived from this 00016 * software without specific prior written permission. 00017 * 00018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00019 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00020 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 00021 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 00022 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00023 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00024 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00025 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00026 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00027 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
00029 */ 00030 #ifndef _DICTIONARY_COMPILER_H 00031 #define _DICTIONARY_COMPILER_H 00032 00033 #include <string> 00034 #include <vector> 00035 #include <map> 00036 #include <fstream> 00037 #include <exception> 00038 #include "../cdbpp-1.1/include/cdbpp.h" 00039 #include "nersuite_exception.h" 00040 #include "ne.h" 00041 00042 typedef std::vector<std::string> V1_STR; 00043 00044 00045 namespace NER 00046 { 00053 enum NormalizeType 00054 { 00058 NormalizeNone = 0, 00062 NormalizeCase = 0x01, 00066 NormalizeNumber = 0x02, 00070 NormalizeSymbol = 0x04, 00074 NormalizeToken = 0x08, 00075 }; 00076 00081 class Dictionary 00082 { 00083 public: 00086 const std::string db_path; 00087 00088 private: 00089 std::ifstream db_ifs; 00090 cdbpp::cdbpp db_reader; 00091 std::map< int, std::string > map_idx2name; 00092 00093 public: 00097 Dictionary(const char* binary_dbname); 00098 00101 virtual ~Dictionary(); 00102 00105 void open(); 00106 00114 const int* get_classes(const std::string& key, int normalize_type, size_t* count) const; 00115 00120 size_t get_class_count() const; 00121 00127 const std::string& get_class_name(int class_index) const; 00128 00133 void build(const char* txt_dbname, int normalize_type); 00134 00135 private: 00136 void normalize(const std::string& form, int normalize_type, V1_STR& normalized_tokens); 00137 00138 void load_index_mapping(); 00139 }; 00140 } 00141 #endif