NERsuite  1.1.1
src/nersuitetest/DictionaryTest.h
00001 #ifndef _DICTIONARY_TEST
00002 #define _DICTIONARY_TEST
00003 
00004 #include <string>
00005 #include <fstream>
00006 #include "../nersuite_common/dictionary.h"
00007 
00008 using namespace std;
00009 using namespace NER;
00010 
// Path of the plain-text dictionary fixture written by PrepareDictionaryTestData().
// The pointer itself is const so the shared test path cannot be reseated by accident.
static const char* const DICTIONARY_TEST_TEXT_FILE = "dictionary_test.txt";
// Path handed to the Dictionary constructor as the compiled database file.
static const char* const DICTIONARY_TEST_DB_FILE = "dictionary_test.cdbpp";
00013 
/**
 * Writes the fixed dictionary fixture shared by the dictionary tests.
 *
 * Every record is a single line: an entry name followed by three
 * tab-separated class names. The third record is followed by one extra
 * empty line.
 * NOTE(review): the empty line looks like it exercises blank-line handling
 * in the dictionary builder, but that is not visible here - confirm before
 * removing it.
 *
 * Marked inline because the definition lives in a header; without it a
 * second translation unit including this file would violate the
 * one-definition rule.
 *
 * @param filename path of the text file to write (opened with the default
 *                 std::ofstream mode, so existing content is replaced)
 */
inline void PrepareDictionaryTestData(const char* filename)
{
        std::ofstream ofs(filename);
        // Plain '\n' instead of endl: same bytes on disk, but no flush after
        // every record; close() below flushes once.
        ofs << "SampleEntry\tClass1\tClass2\tClass3" << '\n';
        ofs << "S0a1m2p3l4e5E6n7t8r9y\tClass1\tClass4\tClass5" << '\n';
        // The literal ends in "\n" and is followed by another newline, which
        // produces the empty line described above.
        ofs << "S_a!m\"p#l$e%E&n\'t(r)y[S]a-m=p~l^e|E\\n@t[r]yS+a;m:p<l>e,E.n?t/r y\tClass6\tClass2\tClass7\n" << '\n';
        ofs << "SampleEntry SampleEntry1 SampleEntry2\tClass8\tClass3\tClass9" << '\n';
        ofs.close();
}
00023 
00024 void TestDictionaryBuild_NoNormalization()
00025 {
00026         PrepareDictionaryTestData(DICTIONARY_TEST_TEXT_FILE);
00027         Dictionary dict(DICTIONARY_TEST_DB_FILE);
00028         dict.build(DICTIONARY_TEST_TEXT_FILE, NormalizeNone);
00029         
00030         dict.open();
00031         size_t count;
00032 
00033         const int* classes = dict.get_classes("SampleEntry", NormalizeNone, &count);
00034         if (count != 3)
00035                 throw new TestException("assert failed", __FILE__, __LINE__);
00036         if (dict.get_class_name(classes[0]) != "Class1")
00037                 throw new TestException("assert failed", __FILE__, __LINE__);
00038         if (dict.get_class_name(classes[1]) != "Class2")
00039                 throw new TestException("assert failed", __FILE__, __LINE__);
00040         if (dict.get_class_name(classes[2]) != "Class3")
00041                 throw new TestException("assert failed", __FILE__, __LINE__);
00042 
00043         classes = dict.get_classes("S0a1m2p3l4e5E6n7t8r9y", NormalizeNone, &count);
00044         if (count != 3)
00045                 throw new TestException("assert failed", __FILE__, __LINE__);
00046         if (dict.get_class_name(classes[0]) != "Class1")
00047                 throw new TestException("assert failed", __FILE__, __LINE__);
00048         if (dict.get_class_name(classes[1]) != "Class4")
00049                 throw new TestException("assert failed", __FILE__, __LINE__);
00050         if (dict.get_class_name(classes[2]) != "Class5")
00051                 throw new TestException("assert failed", __FILE__, __LINE__);
00052 
00053         classes = dict.get_classes("S_a!m\"p#l$e%E&n\'t(r)y[S]a-m=p~l^e|E\\n@t[r]yS+a;m:p<l>e,E.n?t/r y", NormalizeNone, &count);
00054         if (count != 3)
00055                 throw new TestException("assert failed", __FILE__, __LINE__);
00056         if (dict.get_class_name(classes[0]) != "Class6")
00057                 throw new TestException("assert failed", __FILE__, __LINE__);
00058         if (dict.get_class_name(classes[1]) != "Class2")
00059                 throw new TestException("assert failed", __FILE__, __LINE__);
00060         if (dict.get_class_name(classes[2]) != "Class7")
00061                 throw new TestException("assert failed", __FILE__, __LINE__);
00062 
00063         classes = dict.get_classes("SampleEntry SampleEntry1 SampleEntry2", NormalizeNone, &count);
00064         if (count != 3)
00065                 throw new TestException("assert failed", __FILE__, __LINE__);
00066         if (dict.get_class_name(classes[0]) != "Class8")
00067                 throw new TestException("assert failed", __FILE__, __LINE__);
00068         if (dict.get_class_name(classes[1]) != "Class3")
00069                 throw new TestException("assert failed", __FILE__, __LINE__);
00070         if (dict.get_class_name(classes[2]) != "Class9")
00071                 throw new TestException("assert failed", __FILE__, __LINE__);
00072 }
00073 
00074 void TestDictionaryBuild_CaseNormalization()
00075 {
00076         PrepareDictionaryTestData(DICTIONARY_TEST_TEXT_FILE);
00077         Dictionary dict(DICTIONARY_TEST_DB_FILE);
00078         dict.build(DICTIONARY_TEST_TEXT_FILE, NormalizeCase);
00079         
00080         dict.open();
00081         size_t count;
00082 
00083         const int* classes = dict.get_classes("sampleentry", NormalizeCase, &count);
00084         if (count != 3)
00085                 throw new TestException("assert failed", __FILE__, __LINE__);
00086         if (dict.get_class_name(classes[0]) != "Class1")
00087                 throw new TestException("assert failed", __FILE__, __LINE__);
00088         if (dict.get_class_name(classes[1]) != "Class2")
00089                 throw new TestException("assert failed", __FILE__, __LINE__);
00090         if (dict.get_class_name(classes[2]) != "Class3")
00091                 throw new TestException("assert failed", __FILE__, __LINE__);
00092 
00093         classes = dict.get_classes("s0a1m2p3l4e5e6n7t8r9y", NormalizeCase, &count);
00094         if (count != 3)
00095                 throw new TestException("assert failed", __FILE__, __LINE__);
00096         if (dict.get_class_name(classes[0]) != "Class1")
00097                 throw new TestException("assert failed", __FILE__, __LINE__);
00098         if (dict.get_class_name(classes[1]) != "Class4")
00099                 throw new TestException("assert failed", __FILE__, __LINE__);
00100         if (dict.get_class_name(classes[2]) != "Class5")
00101                 throw new TestException("assert failed", __FILE__, __LINE__);
00102 
00103         classes = dict.get_classes("s_a!m\"p#l$e%e&n\'t(r)y[s]a-m=p~l^e|e\\n@t[r]ys+a;m:p<l>e,e.n?t/r y", NormalizeCase, &count);
00104         if (count != 3)
00105                 throw new TestException("assert failed", __FILE__, __LINE__);
00106         if (dict.get_class_name(classes[0]) != "Class6")
00107                 throw new TestException("assert failed", __FILE__, __LINE__);
00108         if (dict.get_class_name(classes[1]) != "Class2")
00109                 throw new TestException("assert failed", __FILE__, __LINE__);
00110         if (dict.get_class_name(classes[2]) != "Class7")
00111                 throw new TestException("assert failed", __FILE__, __LINE__);
00112 
00113         classes = dict.get_classes("sampleentry sampleentry1 sampleentry2", NormalizeCase, &count);
00114         if (count != 3)
00115                 throw new TestException("assert failed", __FILE__, __LINE__);
00116         if (dict.get_class_name(classes[0]) != "Class8")
00117                 throw new TestException("assert failed", __FILE__, __LINE__);
00118         if (dict.get_class_name(classes[1]) != "Class3")
00119                 throw new TestException("assert failed", __FILE__, __LINE__);
00120         if (dict.get_class_name(classes[2]) != "Class9")
00121                 throw new TestException("assert failed", __FILE__, __LINE__);
00122 }
00123 
00124 void TestDictionaryBuild_CaseAndNumberNormalization()
00125 {
00126         PrepareDictionaryTestData(DICTIONARY_TEST_TEXT_FILE);
00127         Dictionary dict(DICTIONARY_TEST_DB_FILE);
00128         dict.build(DICTIONARY_TEST_TEXT_FILE, NormalizeCase|NormalizeNumber);
00129         
00130         dict.open();
00131         size_t count;
00132 
00133         const int* classes = dict.get_classes("sampleentry", NormalizeCase|NormalizeNumber, &count);
00134         if (count != 3)
00135                 throw new TestException("assert failed", __FILE__, __LINE__);
00136         if (dict.get_class_name(classes[0]) != "Class1")
00137                 throw new TestException("assert failed", __FILE__, __LINE__);
00138         if (dict.get_class_name(classes[1]) != "Class2")
00139                 throw new TestException("assert failed", __FILE__, __LINE__);
00140         if (dict.get_class_name(classes[2]) != "Class3")
00141                 throw new TestException("assert failed", __FILE__, __LINE__);
00142 
00143         classes = dict.get_classes("s0a0m0p0l0e0e0n0t0r0y", NormalizeCase|NormalizeNumber, &count);
00144         if (count != 3)
00145                 throw new TestException("assert failed", __FILE__, __LINE__);
00146         if (dict.get_class_name(classes[0]) != "Class1")
00147                 throw new TestException("assert failed", __FILE__, __LINE__);
00148         if (dict.get_class_name(classes[1]) != "Class4")
00149                 throw new TestException("assert failed", __FILE__, __LINE__);
00150         if (dict.get_class_name(classes[2]) != "Class5")
00151                 throw new TestException("assert failed", __FILE__, __LINE__);
00152 
00153         classes = dict.get_classes("s_a!m\"p#l$e%e&n\'t(r)y[s]a-m=p~l^e|e\\n@t[r]ys+a;m:p<l>e,e.n?t/r y", NormalizeCase|NormalizeNumber, &count);
00154         if (count != 3)
00155                 throw new TestException("assert failed", __FILE__, __LINE__);
00156         if (dict.get_class_name(classes[0]) != "Class6")
00157                 throw new TestException("assert failed", __FILE__, __LINE__);
00158         if (dict.get_class_name(classes[1]) != "Class2")
00159                 throw new TestException("assert failed", __FILE__, __LINE__);
00160         if (dict.get_class_name(classes[2]) != "Class7")
00161                 throw new TestException("assert failed", __FILE__, __LINE__);
00162 
00163         classes = dict.get_classes("sampleentry sampleentry0 sampleentry0", NormalizeCase|NormalizeNumber, &count);
00164         if (count != 3)
00165                 throw new TestException("assert failed", __FILE__, __LINE__);
00166         if (dict.get_class_name(classes[0]) != "Class8")
00167                 throw new TestException("assert failed", __FILE__, __LINE__);
00168         if (dict.get_class_name(classes[1]) != "Class3")
00169                 throw new TestException("assert failed", __FILE__, __LINE__);
00170         if (dict.get_class_name(classes[2]) != "Class9")
00171                 throw new TestException("assert failed", __FILE__, __LINE__);
00172 }
00173 
00174 void TestDictionaryBuild_CaseAndNumberAndSymbolNormalization()
00175 {
00176         PrepareDictionaryTestData(DICTIONARY_TEST_TEXT_FILE);
00177         Dictionary dict(DICTIONARY_TEST_DB_FILE);
00178         dict.build(DICTIONARY_TEST_TEXT_FILE, NormalizeCase|NormalizeNumber|NormalizeSymbol);
00179         
00180         dict.open();
00181         size_t count;
00182 
00183         const int* classes = dict.get_classes("sampleentry", NormalizeCase|NormalizeNumber|NormalizeSymbol, &count);
00184         if (count != 3)
00185                 throw new TestException("assert failed", __FILE__, __LINE__);
00186         if (dict.get_class_name(classes[0]) != "Class1")
00187                 throw new TestException("assert failed", __FILE__, __LINE__);
00188         if (dict.get_class_name(classes[1]) != "Class2")
00189                 throw new TestException("assert failed", __FILE__, __LINE__);
00190         if (dict.get_class_name(classes[2]) != "Class3")
00191                 throw new TestException("assert failed", __FILE__, __LINE__);
00192 
00193         classes = dict.get_classes("s0a0m0p0l0e0e0n0t0r0y", NormalizeCase|NormalizeNumber|NormalizeSymbol, &count);
00194         if (count != 3)
00195                 throw new TestException("assert failed", __FILE__, __LINE__);
00196         if (dict.get_class_name(classes[0]) != "Class1")
00197                 throw new TestException("assert failed", __FILE__, __LINE__);
00198         if (dict.get_class_name(classes[1]) != "Class4")
00199                 throw new TestException("assert failed", __FILE__, __LINE__);
00200         if (dict.get_class_name(classes[2]) != "Class5")
00201                 throw new TestException("assert failed", __FILE__, __LINE__);
00202 
00203         classes = dict.get_classes("s_a_m_p_l_e_e_n_t_r_y_s_a_m_p_l_e_e_n_t_r_ys_a_m_p_l_e_e_n_t_r_y", NormalizeCase|NormalizeNumber|NormalizeSymbol, &count);
00204         if (count != 3)
00205                 throw new TestException("assert failed", __FILE__, __LINE__);
00206         if (dict.get_class_name(classes[0]) != "Class6")
00207                 throw new TestException("assert failed", __FILE__, __LINE__);
00208         if (dict.get_class_name(classes[1]) != "Class2")
00209                 throw new TestException("assert failed", __FILE__, __LINE__);
00210         if (dict.get_class_name(classes[2]) != "Class7")
00211                 throw new TestException("assert failed", __FILE__, __LINE__);
00212 
00213         classes = dict.get_classes("sampleentry_sampleentry0_sampleentry0", NormalizeCase|NormalizeNumber|NormalizeSymbol, &count);
00214         if (count != 3)
00215                 throw new TestException("assert failed", __FILE__, __LINE__);
00216         if (dict.get_class_name(classes[0]) != "Class8")
00217                 throw new TestException("assert failed", __FILE__, __LINE__);
00218         if (dict.get_class_name(classes[1]) != "Class3")
00219                 throw new TestException("assert failed", __FILE__, __LINE__);
00220         if (dict.get_class_name(classes[2]) != "Class9")
00221                 throw new TestException("assert failed", __FILE__, __LINE__);
00222 }
00223 
00224 void TestDictionaryBuild_TokenizerNormalization()
00225 {
00226         PrepareDictionaryTestData(DICTIONARY_TEST_TEXT_FILE);
00227         Dictionary dict(DICTIONARY_TEST_DB_FILE);
00228         dict.build(DICTIONARY_TEST_TEXT_FILE, NormalizeToken);
00229         
00230         dict.open();
00231         size_t count;
00232 
00233         const int* classes = dict.get_classes("SampleEntry", NormalizeToken, &count);
00234         if (count != 5)
00235                 throw new TestException("assert failed", __FILE__, __LINE__);
00236         if (dict.get_class_name(classes[0]) != "Class1")
00237                 throw new TestException("assert failed", __FILE__, __LINE__);
00238         if (dict.get_class_name(classes[1]) != "Class2")
00239                 throw new TestException("assert failed", __FILE__, __LINE__);
00240         if (dict.get_class_name(classes[2]) != "Class3")
00241                 throw new TestException("assert failed", __FILE__, __LINE__);
00242         if (dict.get_class_name(classes[3]) != "Class8")
00243                 throw new TestException("assert failed", __FILE__, __LINE__);
00244         if (dict.get_class_name(classes[4]) != "Class9")
00245                 throw new TestException("assert failed", __FILE__, __LINE__);
00246 
00247         classes = dict.get_classes("SampleEntry1", NormalizeToken, &count);
00248         if (count != 3)
00249                 throw new TestException("assert failed", __FILE__, __LINE__);
00250         if (dict.get_class_name(classes[0]) != "Class8")
00251                 throw new TestException("assert failed", __FILE__, __LINE__);
00252         if (dict.get_class_name(classes[1]) != "Class3")
00253                 throw new TestException("assert failed", __FILE__, __LINE__);
00254         if (dict.get_class_name(classes[2]) != "Class9")
00255                 throw new TestException("assert failed", __FILE__, __LINE__);
00256 
00257         classes = dict.get_classes("SampleEntry2", NormalizeToken, &count);
00258         if (count != 3)
00259                 throw new TestException("assert failed", __FILE__, __LINE__);
00260         if (dict.get_class_name(classes[0]) != "Class8")
00261                 throw new TestException("assert failed", __FILE__, __LINE__);
00262         if (dict.get_class_name(classes[1]) != "Class3")
00263                 throw new TestException("assert failed", __FILE__, __LINE__);
00264         if (dict.get_class_name(classes[2]) != "Class9")
00265                 throw new TestException("assert failed", __FILE__, __LINE__);
00266 
00267 
00268 }
00269 #endif
 All Classes Functions Variables