NERsuite  1.1.1
src/nersuite_common/tokenizer.h
00001 /*
00002 *      A sentence tokenizer class with user-defined delimiters
00003 *
00004 * Copyright (c) 
00005 * All rights reserved.
00006 *
00007 * Redistribution and use in source and binary forms, with or without
00008 * modification, are permitted provided that the following conditions are met:
00009 *     * Redistributions of source code must retain the above copyright
00010 *       notice, this list of conditions and the following disclaimer.
00011 *     * Redistributions in binary form must reproduce the above copyright
00012 *       notice, this list of conditions and the following disclaimer in the
00013 *       documentation and/or other materials provided with the distribution.
00014 *     * Neither the names of the authors nor the names of its contributors
00015 *       may be used to endorse or promote products derived from this
00016 *       software without specific prior written permission.
00017 *
00018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00019 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00020 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00021 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
00022 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00023 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00024 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00025 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00026 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00027 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029 */
00030 #ifndef _NERSUITE_TOKENIZER_H
00031 #define _NERSUITE_TOKENIZER_H
00032 
00033 #include <iostream>
00034 #include <string>
00035 #include <vector>
00036 #include <stdlib.h>
00037 
00038 namespace NER
00039 {
00040         typedef std::vector< std::string >      V1_STR;   // one-dimensional list of strings
00041         typedef std::vector< V1_STR >           V2_STR;   // list of V1_STR, i.e. a two-dimensional table of strings
00042 
00047         class Tokenizer
00048         {
              // Sentence tokenizer (the file header describes it as a sentence
              // tokenizer with user-defined delimiters). This header only declares
              // the members; their definitions are not visible here, so the notes
              // below describe the signatures and hedge anything about behaviour.
00049         private:
                      // Private helper: reads `trimmed_sent` and receives `data` by
                      // non-const reference, so it is able to modify the table.
                      // NOTE(review): presumably splits the trimmed sentence into tokens
                      // stored in `data`; the meaning of the int result (count vs. status)
                      // must be confirmed against the definition.
00050                 int splitter( const std::string &trimmed_sent, V2_STR &data );
                      // Private helper: reads `raw_sent`, may modify `data`, and takes a
                      // starting offset that defaults to 0 when omitted.
                      // NOTE(review): presumably records each token's position within
                      // `raw_sent`, shifted by `init_offset` -- confirm.
00051                 void mark_pos( const std::string &raw_sent, V2_STR &data, size_t init_offset = 0 );
00052 
                      // Private helper: returns a new string built from `raw_sent`
                      // (the argument itself is const and left untouched).
                      // NOTE(review): the name suggests whitespace trimming; which
                      // characters and which ends are affected is not visible here.
00053                 std::string trim_ws( const std::string &raw_sent );
                      // Private helper: returns a size_t derived from `trimmed_sent` and
                      // the index `beg`.
                      // NOTE(review): presumably the end index of the token starting at
                      // `beg`; whether it is inclusive or exclusive needs confirming.
00054                 size_t find_token_end( const std::string &trimmed_sent, const size_t beg );
00055 
00056         public:
                      // The only public member: takes the sentence `raw_sent`, a table
                      // `data` passed by non-const reference, and an optional
                      // `init_offset` (defaults to 0).
                      // NOTE(review): presumably fills `data` with the tokens of
                      // `raw_sent` and their positions; the meaning of the int return
                      // value is not documented in this listing -- confirm.
00063                 int tokenize( const std::string &raw_sent, V2_STR &data, size_t init_offset = 0 );
00064 
00065         };
00066 }
00067 
00068 #endif
 All Classes Functions Variables