NERsuite  1.1.1
src/nersuite_common/dictionary.h
00001 /*
00002 *      dictionary library
00003 *
00004 * Copyright (c) 
00005 * All rights reserved.
00006 *
00007 * Redistribution and use in source and binary forms, with or without
00008 * modification, are permitted provided that the following conditions are met:
00009 *     * Redistributions of source code must retain the above copyright
00010 *       notice, this list of conditions and the following disclaimer.
00011 *     * Redistributions in binary form must reproduce the above copyright
00012 *       notice, this list of conditions and the following disclaimer in the
00013 *       documentation and/or other materials provided with the distribution.
00014 *     * Neither the names of the authors nor the names of its contributors
00015 *       may be used to endorse or promote products derived from this
00016 *       software without specific prior written permission.
00017 *
00018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00019 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00020 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00021 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
00022 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00023 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00024 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00025 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00026 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00027 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029 */
00030 #ifndef _DICTIONARY_COMPILER_H
00031 #define _DICTIONARY_COMPILER_H
00032 
00033 #include <string>
00034 #include <vector>
00035 #include <map>
00036 #include <fstream>
00037 #include <exception>
00038 #include "../cdbpp-1.1/include/cdbpp.h"
00039 #include "nersuite_exception.h"
00040 #include "ne.h"
00041 
// Alias for a vector of strings, declared at global scope (outside namespace NER).
// Within this header it is the type of the output parameter of Dictionary::normalize().
00042 typedef std::vector<std::string>                V1_STR;
00043 
00044 
00045 namespace NER
00046 {
// Normalization option flags.
// Every non-zero enumerator is a distinct single bit (0x01, 0x02, 0x04, 0x08), and the
// Dictionary member functions in this header take these values through an
// `int normalize_type` parameter rather than through the enum type itself.
// NOTE(review): presumably the flags are meant to be OR-ed together - confirm against
// the implementation file. The exact transformation behind each flag is not visible in
// this header; the per-flag notes below are inferred from the names only.
00053         enum NormalizeType
00054         {
                      // No flag bits set (presumably: leave the text unchanged - TODO confirm).
00058                 NormalizeNone = 0,
                      // Bit 0. Presumably letter-case normalization - TODO confirm.
00062                 NormalizeCase = 0x01,
                      // Bit 1. Presumably normalization of digits/numbers - TODO confirm.
00066                 NormalizeNumber = 0x02,
                      // Bit 2. Presumably normalization of symbol characters - TODO confirm.
00070                 NormalizeSymbol = 0x04,
                      // Bit 3. Presumably token-level normalization - TODO confirm.
00074                 NormalizeToken = 0x08,
00075         };
00076 
// Dictionary that holds a cdbpp reader (db_reader), an input file stream (db_ifs) and an
// int -> string table (map_idx2name). It declares operations to open the database, look
// up the classes stored for a key, query class count and names, and build a database
// from a text file. Only declarations are visible here; the behaviour described below is
// limited to what the signatures show, and anything inferred from names is marked.
// The std::ifstream member makes the class non-copyable.
00081         class Dictionary
00082         {
00083         public:
                      // Publicly readable path string; const, so it cannot be reassigned after
                      // construction. NOTE(review): presumably initialized from the
                      // constructor's binary_dbname argument - confirm in the implementation.
00086                 const std::string       db_path;
00087 
00088         private:
                      // Input file stream. NOTE(review): presumably the stream db_reader reads
                      // the database from - confirm.
00089                 std::ifstream   db_ifs;
                      // cdbpp database object (type comes from cdbpp.h, included by this header).
00090                 cdbpp::cdbpp    db_reader;
                      // Maps an int to a string. By its name, class index -> class name.
                      // NOTE(review): presumably filled by load_index_mapping() - confirm.
00091                 std::map< int, std::string > map_idx2name;
00092 
00093         public:
                      // Constructs a dictionary from a C-string database name.
                      // The constructor is not `explicit`, so a `const char*` converts
                      // implicitly to Dictionary.
                      // @param binary_dbname  NOTE(review): presumably the path of the compiled
                      //                       (binary) database file - confirm.
00097                 Dictionary(const char* binary_dbname);
00098 
                      // Virtual destructor, so deleting a derived object through a
                      // Dictionary pointer is well defined.
00101                 virtual ~Dictionary();
00102 
                      // Non-const operation taking no arguments.
                      // NOTE(review): presumably opens the database at db_path for reading;
                      // failure handling is not visible here (nersuite_exception.h is
                      // included, so it may throw - confirm).
00105                 void open();
00106 
                      // Looks up `key` and returns a pointer to read-only ints; does not modify
                      // the object (const member).
                      // @param key             lookup key.
                      // @param normalize_type  int carrying NormalizeType flag value(s).
                      // @param count           output pointer. NOTE(review): presumably receives
                      //                        the number of ints behind the returned pointer.
                      // @return pointer to const int. NOTE(review): ownership, lifetime and the
                      //         not-found result (null pointer?) are not visible here - confirm
                      //         before storing or freeing the pointer.
00114                 const int* get_classes(const std::string& key, int normalize_type, size_t* count) const;
00115 
                      // Returns a class count as size_t; const member.
                      // NOTE(review): presumably the number of entries in map_idx2name - confirm.
00120                 size_t get_class_count() const;
00121 
                      // Returns a const reference to the name for `class_index`; const member.
                      // NOTE(review): behaviour for an unknown index and the lifetime of the
                      // referenced string are not visible here - confirm.
00127                 const std::string& get_class_name(int class_index) const;
00128 
                      // Non-const build operation.
                      // @param txt_dbname      NOTE(review): presumably the path of the text
                      //                        dictionary to compile - confirm.
                      // @param normalize_type  int carrying NormalizeType flag value(s).
00133                 void build(const char* txt_dbname, int normalize_type);
00134 
00135         private:
                      // Private helper; results are written into `normalized_tokens`, which is
                      // passed by non-const reference. Declared non-const, unlike get_classes().
                      // @param form               input string.
                      // @param normalize_type     int carrying NormalizeType flag value(s).
                      // @param normalized_tokens  output vector of strings.
00136                 void normalize(const std::string& form, int normalize_type, V1_STR& normalized_tokens);
00137 
                      // Private helper taking no arguments.
                      // NOTE(review): by its name, loads the index -> name mapping
                      // (map_idx2name) - confirm in the implementation.
00138                 void load_index_mapping();
00139         };
00140 }
00141 #endif
 All Classes Functions Variables