NERsuite
1.1.1
|
/*
 *      A sentence tokenizer class with user-defined delimiters
 *
 * Copyright (c)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the names of the authors nor the names of its contributors
 *       may be used to endorse or promote products derived from this
 *       software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NOTE(review): a name starting with an underscore followed by an uppercase
 * letter is reserved for the implementation in C++. The guard name is kept
 * unchanged here so that any external code testing this macro keeps working.
 */
#ifndef _NERSUITE_TOKENIZER_H
#define _NERSUITE_TOKENIZER_H

#include <iostream>
#include <string>
#include <vector>
#include <stdlib.h>

namespace NER
{
    /** A one-dimensional list of strings. */
    typedef std::vector< std::string >  V1_STR;

    /** A list of string lists (a two-dimensional table of strings). */
    typedef std::vector< V1_STR >       V2_STR;

    /**
     * Sentence tokenizer.
     *
     * Only the declarations are visible in this header; the member functions
     * are defined elsewhere.
     * NOTE(review): the exact layout of the rows stored in a V2_STR result
     * (which columns hold the token text, offsets, ...) is not shown here --
     * confirm against the implementation file.
     */
    class Tokenizer
    {
    private:
        /**
         * Internal helper working on an already trimmed sentence.
         *
         * @param trimmed_sent  Input sentence (read-only).
         * @param data          Table that receives the result.
         * @return Integer status or count.
         *         NOTE(review): meaning of the value not visible here -- confirm.
         */
        int         splitter( const std::string &trimmed_sent, V2_STR &data );

        /**
         * Internal helper relating entries of @p data to @p raw_sent.
         *
         * @param raw_sent      Original sentence (read-only).
         * @param data          Table that is updated in place.
         * @param init_offset   Starting offset, 0 when omitted.
         *                      NOTE(review): presumably added to every recorded
         *                      position -- confirm.
         */
        void        mark_pos( const std::string &raw_sent, V2_STR &data, size_t init_offset = 0 );

        /**
         * Internal helper returning a new string derived from @p raw_sent.
         * NOTE(review): presumably strips surrounding whitespace -- confirm.
         */
        std::string trim_ws( const std::string &raw_sent );

        /**
         * Internal helper returning a position inside @p trimmed_sent.
         *
         * @param trimmed_sent  Sentence to scan (read-only).
         * @param beg           Position at which scanning starts.
         * @return A position in @p trimmed_sent.
         *         NOTE(review): presumably the end of the token beginning at
         *         @p beg; inclusive/exclusive not visible here -- confirm.
         */
        size_t      find_token_end( const std::string &trimmed_sent, const size_t beg );

    public:
        /**
         * Tokenizes a sentence and stores the result in @p data.
         *
         * @param raw_sent      Sentence to tokenize (read-only).
         * @param data          Table that receives the result.
         * @param init_offset   Starting offset, 0 when omitted.
         *                      NOTE(review): presumably the position of
         *                      @p raw_sent within a larger text -- confirm.
         * @return Integer status or count.
         *         NOTE(review): meaning of the value not visible here -- confirm.
         */
        int         tokenize( const std::string &raw_sent, V2_STR &data, size_t init_offset = 0 );

    };
}

#endif