// NERsuite 1.1.1
00001 #ifndef _DICTIONARY_TEST 00002 #define _DICTIONARY_TEST 00003 00004 #include <string> 00005 #include <fstream> 00006 #include "../nersuite_common/dictionary.h" 00007 00008 using namespace std; 00009 using namespace NER; 00010 00011 static const char* DICTIONARY_TEST_TEXT_FILE = "dictionary_test.txt"; 00012 static const char* DICTIONARY_TEST_DB_FILE = "dictionary_test.cdbpp"; 00013 00014 void PrepareDictionaryTestData(const char* filename) 00015 { 00016 ofstream ofs(filename); 00017 ofs << "SampleEntry\tClass1\tClass2\tClass3" << endl; 00018 ofs << "S0a1m2p3l4e5E6n7t8r9y\tClass1\tClass4\tClass5" << endl; 00019 ofs << "S_a!m\"p#l$e%E&n\'t(r)y[S]a-m=p~l^e|E\\n@t[r]yS+a;m:p<l>e,E.n?t/r y\tClass6\tClass2\tClass7\n" << endl; 00020 ofs << "SampleEntry SampleEntry1 SampleEntry2\tClass8\tClass3\tClass9" << endl; 00021 ofs.close(); 00022 } 00023 00024 void TestDictionaryBuild_NoNormalization() 00025 { 00026 PrepareDictionaryTestData(DICTIONARY_TEST_TEXT_FILE); 00027 Dictionary dict(DICTIONARY_TEST_DB_FILE); 00028 dict.build(DICTIONARY_TEST_TEXT_FILE, NormalizeNone); 00029 00030 dict.open(); 00031 size_t count; 00032 00033 const int* classes = dict.get_classes("SampleEntry", NormalizeNone, &count); 00034 if (count != 3) 00035 throw new TestException("assert failed", __FILE__, __LINE__); 00036 if (dict.get_class_name(classes[0]) != "Class1") 00037 throw new TestException("assert failed", __FILE__, __LINE__); 00038 if (dict.get_class_name(classes[1]) != "Class2") 00039 throw new TestException("assert failed", __FILE__, __LINE__); 00040 if (dict.get_class_name(classes[2]) != "Class3") 00041 throw new TestException("assert failed", __FILE__, __LINE__); 00042 00043 classes = dict.get_classes("S0a1m2p3l4e5E6n7t8r9y", NormalizeNone, &count); 00044 if (count != 3) 00045 throw new TestException("assert failed", __FILE__, __LINE__); 00046 if (dict.get_class_name(classes[0]) != "Class1") 00047 throw new TestException("assert failed", __FILE__, __LINE__); 00048 if 
(dict.get_class_name(classes[1]) != "Class4") 00049 throw new TestException("assert failed", __FILE__, __LINE__); 00050 if (dict.get_class_name(classes[2]) != "Class5") 00051 throw new TestException("assert failed", __FILE__, __LINE__); 00052 00053 classes = dict.get_classes("S_a!m\"p#l$e%E&n\'t(r)y[S]a-m=p~l^e|E\\n@t[r]yS+a;m:p<l>e,E.n?t/r y", NormalizeNone, &count); 00054 if (count != 3) 00055 throw new TestException("assert failed", __FILE__, __LINE__); 00056 if (dict.get_class_name(classes[0]) != "Class6") 00057 throw new TestException("assert failed", __FILE__, __LINE__); 00058 if (dict.get_class_name(classes[1]) != "Class2") 00059 throw new TestException("assert failed", __FILE__, __LINE__); 00060 if (dict.get_class_name(classes[2]) != "Class7") 00061 throw new TestException("assert failed", __FILE__, __LINE__); 00062 00063 classes = dict.get_classes("SampleEntry SampleEntry1 SampleEntry2", NormalizeNone, &count); 00064 if (count != 3) 00065 throw new TestException("assert failed", __FILE__, __LINE__); 00066 if (dict.get_class_name(classes[0]) != "Class8") 00067 throw new TestException("assert failed", __FILE__, __LINE__); 00068 if (dict.get_class_name(classes[1]) != "Class3") 00069 throw new TestException("assert failed", __FILE__, __LINE__); 00070 if (dict.get_class_name(classes[2]) != "Class9") 00071 throw new TestException("assert failed", __FILE__, __LINE__); 00072 } 00073 00074 void TestDictionaryBuild_CaseNormalization() 00075 { 00076 PrepareDictionaryTestData(DICTIONARY_TEST_TEXT_FILE); 00077 Dictionary dict(DICTIONARY_TEST_DB_FILE); 00078 dict.build(DICTIONARY_TEST_TEXT_FILE, NormalizeCase); 00079 00080 dict.open(); 00081 size_t count; 00082 00083 const int* classes = dict.get_classes("sampleentry", NormalizeCase, &count); 00084 if (count != 3) 00085 throw new TestException("assert failed", __FILE__, __LINE__); 00086 if (dict.get_class_name(classes[0]) != "Class1") 00087 throw new TestException("assert failed", __FILE__, __LINE__); 00088 if 
(dict.get_class_name(classes[1]) != "Class2") 00089 throw new TestException("assert failed", __FILE__, __LINE__); 00090 if (dict.get_class_name(classes[2]) != "Class3") 00091 throw new TestException("assert failed", __FILE__, __LINE__); 00092 00093 classes = dict.get_classes("s0a1m2p3l4e5e6n7t8r9y", NormalizeCase, &count); 00094 if (count != 3) 00095 throw new TestException("assert failed", __FILE__, __LINE__); 00096 if (dict.get_class_name(classes[0]) != "Class1") 00097 throw new TestException("assert failed", __FILE__, __LINE__); 00098 if (dict.get_class_name(classes[1]) != "Class4") 00099 throw new TestException("assert failed", __FILE__, __LINE__); 00100 if (dict.get_class_name(classes[2]) != "Class5") 00101 throw new TestException("assert failed", __FILE__, __LINE__); 00102 00103 classes = dict.get_classes("s_a!m\"p#l$e%e&n\'t(r)y[s]a-m=p~l^e|e\\n@t[r]ys+a;m:p<l>e,e.n?t/r y", NormalizeCase, &count); 00104 if (count != 3) 00105 throw new TestException("assert failed", __FILE__, __LINE__); 00106 if (dict.get_class_name(classes[0]) != "Class6") 00107 throw new TestException("assert failed", __FILE__, __LINE__); 00108 if (dict.get_class_name(classes[1]) != "Class2") 00109 throw new TestException("assert failed", __FILE__, __LINE__); 00110 if (dict.get_class_name(classes[2]) != "Class7") 00111 throw new TestException("assert failed", __FILE__, __LINE__); 00112 00113 classes = dict.get_classes("sampleentry sampleentry1 sampleentry2", NormalizeCase, &count); 00114 if (count != 3) 00115 throw new TestException("assert failed", __FILE__, __LINE__); 00116 if (dict.get_class_name(classes[0]) != "Class8") 00117 throw new TestException("assert failed", __FILE__, __LINE__); 00118 if (dict.get_class_name(classes[1]) != "Class3") 00119 throw new TestException("assert failed", __FILE__, __LINE__); 00120 if (dict.get_class_name(classes[2]) != "Class9") 00121 throw new TestException("assert failed", __FILE__, __LINE__); 00122 } 00123 00124 void 
TestDictionaryBuild_CaseAndNumberNormalization() 00125 { 00126 PrepareDictionaryTestData(DICTIONARY_TEST_TEXT_FILE); 00127 Dictionary dict(DICTIONARY_TEST_DB_FILE); 00128 dict.build(DICTIONARY_TEST_TEXT_FILE, NormalizeCase|NormalizeNumber); 00129 00130 dict.open(); 00131 size_t count; 00132 00133 const int* classes = dict.get_classes("sampleentry", NormalizeCase|NormalizeNumber, &count); 00134 if (count != 3) 00135 throw new TestException("assert failed", __FILE__, __LINE__); 00136 if (dict.get_class_name(classes[0]) != "Class1") 00137 throw new TestException("assert failed", __FILE__, __LINE__); 00138 if (dict.get_class_name(classes[1]) != "Class2") 00139 throw new TestException("assert failed", __FILE__, __LINE__); 00140 if (dict.get_class_name(classes[2]) != "Class3") 00141 throw new TestException("assert failed", __FILE__, __LINE__); 00142 00143 classes = dict.get_classes("s0a0m0p0l0e0e0n0t0r0y", NormalizeCase|NormalizeNumber, &count); 00144 if (count != 3) 00145 throw new TestException("assert failed", __FILE__, __LINE__); 00146 if (dict.get_class_name(classes[0]) != "Class1") 00147 throw new TestException("assert failed", __FILE__, __LINE__); 00148 if (dict.get_class_name(classes[1]) != "Class4") 00149 throw new TestException("assert failed", __FILE__, __LINE__); 00150 if (dict.get_class_name(classes[2]) != "Class5") 00151 throw new TestException("assert failed", __FILE__, __LINE__); 00152 00153 classes = dict.get_classes("s_a!m\"p#l$e%e&n\'t(r)y[s]a-m=p~l^e|e\\n@t[r]ys+a;m:p<l>e,e.n?t/r y", NormalizeCase|NormalizeNumber, &count); 00154 if (count != 3) 00155 throw new TestException("assert failed", __FILE__, __LINE__); 00156 if (dict.get_class_name(classes[0]) != "Class6") 00157 throw new TestException("assert failed", __FILE__, __LINE__); 00158 if (dict.get_class_name(classes[1]) != "Class2") 00159 throw new TestException("assert failed", __FILE__, __LINE__); 00160 if (dict.get_class_name(classes[2]) != "Class7") 00161 throw new TestException("assert 
failed", __FILE__, __LINE__); 00162 00163 classes = dict.get_classes("sampleentry sampleentry0 sampleentry0", NormalizeCase|NormalizeNumber, &count); 00164 if (count != 3) 00165 throw new TestException("assert failed", __FILE__, __LINE__); 00166 if (dict.get_class_name(classes[0]) != "Class8") 00167 throw new TestException("assert failed", __FILE__, __LINE__); 00168 if (dict.get_class_name(classes[1]) != "Class3") 00169 throw new TestException("assert failed", __FILE__, __LINE__); 00170 if (dict.get_class_name(classes[2]) != "Class9") 00171 throw new TestException("assert failed", __FILE__, __LINE__); 00172 } 00173 00174 void TestDictionaryBuild_CaseAndNumberAndSymbolNormalization() 00175 { 00176 PrepareDictionaryTestData(DICTIONARY_TEST_TEXT_FILE); 00177 Dictionary dict(DICTIONARY_TEST_DB_FILE); 00178 dict.build(DICTIONARY_TEST_TEXT_FILE, NormalizeCase|NormalizeNumber|NormalizeSymbol); 00179 00180 dict.open(); 00181 size_t count; 00182 00183 const int* classes = dict.get_classes("sampleentry", NormalizeCase|NormalizeNumber|NormalizeSymbol, &count); 00184 if (count != 3) 00185 throw new TestException("assert failed", __FILE__, __LINE__); 00186 if (dict.get_class_name(classes[0]) != "Class1") 00187 throw new TestException("assert failed", __FILE__, __LINE__); 00188 if (dict.get_class_name(classes[1]) != "Class2") 00189 throw new TestException("assert failed", __FILE__, __LINE__); 00190 if (dict.get_class_name(classes[2]) != "Class3") 00191 throw new TestException("assert failed", __FILE__, __LINE__); 00192 00193 classes = dict.get_classes("s0a0m0p0l0e0e0n0t0r0y", NormalizeCase|NormalizeNumber|NormalizeSymbol, &count); 00194 if (count != 3) 00195 throw new TestException("assert failed", __FILE__, __LINE__); 00196 if (dict.get_class_name(classes[0]) != "Class1") 00197 throw new TestException("assert failed", __FILE__, __LINE__); 00198 if (dict.get_class_name(classes[1]) != "Class4") 00199 throw new TestException("assert failed", __FILE__, __LINE__); 00200 if 
(dict.get_class_name(classes[2]) != "Class5") 00201 throw new TestException("assert failed", __FILE__, __LINE__); 00202 00203 classes = dict.get_classes("s_a_m_p_l_e_e_n_t_r_y_s_a_m_p_l_e_e_n_t_r_ys_a_m_p_l_e_e_n_t_r_y", NormalizeCase|NormalizeNumber|NormalizeSymbol, &count); 00204 if (count != 3) 00205 throw new TestException("assert failed", __FILE__, __LINE__); 00206 if (dict.get_class_name(classes[0]) != "Class6") 00207 throw new TestException("assert failed", __FILE__, __LINE__); 00208 if (dict.get_class_name(classes[1]) != "Class2") 00209 throw new TestException("assert failed", __FILE__, __LINE__); 00210 if (dict.get_class_name(classes[2]) != "Class7") 00211 throw new TestException("assert failed", __FILE__, __LINE__); 00212 00213 classes = dict.get_classes("sampleentry_sampleentry0_sampleentry0", NormalizeCase|NormalizeNumber|NormalizeSymbol, &count); 00214 if (count != 3) 00215 throw new TestException("assert failed", __FILE__, __LINE__); 00216 if (dict.get_class_name(classes[0]) != "Class8") 00217 throw new TestException("assert failed", __FILE__, __LINE__); 00218 if (dict.get_class_name(classes[1]) != "Class3") 00219 throw new TestException("assert failed", __FILE__, __LINE__); 00220 if (dict.get_class_name(classes[2]) != "Class9") 00221 throw new TestException("assert failed", __FILE__, __LINE__); 00222 } 00223 00224 void TestDictionaryBuild_TokenizerNormalization() 00225 { 00226 PrepareDictionaryTestData(DICTIONARY_TEST_TEXT_FILE); 00227 Dictionary dict(DICTIONARY_TEST_DB_FILE); 00228 dict.build(DICTIONARY_TEST_TEXT_FILE, NormalizeToken); 00229 00230 dict.open(); 00231 size_t count; 00232 00233 const int* classes = dict.get_classes("SampleEntry", NormalizeToken, &count); 00234 if (count != 5) 00235 throw new TestException("assert failed", __FILE__, __LINE__); 00236 if (dict.get_class_name(classes[0]) != "Class1") 00237 throw new TestException("assert failed", __FILE__, __LINE__); 00238 if (dict.get_class_name(classes[1]) != "Class2") 00239 throw new 
TestException("assert failed", __FILE__, __LINE__); 00240 if (dict.get_class_name(classes[2]) != "Class3") 00241 throw new TestException("assert failed", __FILE__, __LINE__); 00242 if (dict.get_class_name(classes[3]) != "Class8") 00243 throw new TestException("assert failed", __FILE__, __LINE__); 00244 if (dict.get_class_name(classes[4]) != "Class9") 00245 throw new TestException("assert failed", __FILE__, __LINE__); 00246 00247 classes = dict.get_classes("SampleEntry1", NormalizeToken, &count); 00248 if (count != 3) 00249 throw new TestException("assert failed", __FILE__, __LINE__); 00250 if (dict.get_class_name(classes[0]) != "Class8") 00251 throw new TestException("assert failed", __FILE__, __LINE__); 00252 if (dict.get_class_name(classes[1]) != "Class3") 00253 throw new TestException("assert failed", __FILE__, __LINE__); 00254 if (dict.get_class_name(classes[2]) != "Class9") 00255 throw new TestException("assert failed", __FILE__, __LINE__); 00256 00257 classes = dict.get_classes("SampleEntry2", NormalizeToken, &count); 00258 if (count != 3) 00259 throw new TestException("assert failed", __FILE__, __LINE__); 00260 if (dict.get_class_name(classes[0]) != "Class8") 00261 throw new TestException("assert failed", __FILE__, __LINE__); 00262 if (dict.get_class_name(classes[1]) != "Class3") 00263 throw new TestException("assert failed", __FILE__, __LINE__); 00264 if (dict.get_class_name(classes[2]) != "Class9") 00265 throw new TestException("assert failed", __FILE__, __LINE__); 00266 00267 00268 } 00269 #endif