NERsuite
1.1.1
|
00001 /* 00002 * C++ implementation of Constant Database (CDB++) 00003 * 00004 * Copyright (c) 2008,2009, Naoaki Okazaki 00005 * All rights reserved. 00006 * 00007 * Redistribution and use in source and binary forms, with or without 00008 * modification, are permitted provided that the following conditions are met: 00009 * * Redistributions of source code must retain the above copyright 00010 * notice, this list of conditions and the following disclaimer. 00011 * * Redistributions in binary form must reproduce the above copyright 00012 * notice, this list of conditions and the following disclaimer in the 00013 * documentation and/or other materials provided with the distribution. 00014 * * Neither the name of the authors nor the names of its contributors may 00015 * be used to endorse or promote products derived from this software 00016 * without specific prior written permission. 00017 * 00018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00019 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00020 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 00021 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 00022 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00023 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00024 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00025 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00026 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00027 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00029 */ 00030 00031 /* $Id: cdbpp.h,v 1.1.1.1 2010/12/17 07:27:40 hccho Exp $ */ 00032 00033 #ifndef __CDBPP_H__ 00034 #define __CDBPP_H__ 00035 00036 #include <cstring> 00037 #include <fstream> 00038 #include <functional> 00039 #include <iostream> 00040 #include <vector> 00041 #include <stdint.h> 00042 #include <stdexcept> 00043 00044 namespace cdbpp 00045 { 00046 00054 // Global constants. 00055 enum { 00056 // Version number. 00057 VERSION = 1, 00058 // The number of hash tables. 00059 NUM_TABLES = 256, 00060 // A constant for byte-order checking. 00061 BYTEORDER_CHECK = 0x62445371, 00062 }; 00063 00064 00065 00066 00080 class murmurhash2 : 00081 public std::binary_function<const void *, size_t, uint32_t> 00082 { 00083 protected: 00084 inline static uint32_t get32bits(const char *d) 00085 { 00086 return *reinterpret_cast<const uint32_t*>(d); 00087 } 00088 00089 public: 00090 inline uint32_t operator() (const void *key, size_t size) const 00091 { 00092 // 'm' and 'r' are mixing constants generated offline. 00093 // They're not really 'magic', they just happen to work well. 00094 00095 const uint32_t m = 0x5bd1e995; 00096 const int32_t r = 24; 00097 00098 // Initialize the hash to a 'random' value 00099 00100 const uint32_t seed = 0x87654321; 00101 uint32_t h = seed ^ size; 00102 00103 // Mix 4 bytes at a time into the hash 00104 00105 const char * data = (const char *)key; 00106 00107 while (size >= 4) 00108 { 00109 uint32_t k = get32bits(data); 00110 00111 k *= m; 00112 k ^= k >> r; 00113 k *= m; 00114 00115 h *= m; 00116 h ^= k; 00117 00118 data += 4; 00119 size -= 4; 00120 } 00121 00122 // Handle the last few bytes of the input array 00123 00124 switch (size) 00125 { 00126 case 3: h ^= data[2] << 16; 00127 case 2: h ^= data[1] << 8; 00128 case 1: h ^= data[0]; 00129 h *= m; 00130 }; 00131 00132 // Do a few final mixes of the hash to ensure the last few 00133 // bytes are well-incorporated. 00134 00135 h ^= h >> 13; 00136 h *= m; 00137 h ^= h >> 15; 00138 00139 return h; 00140 } 00141 }; 00142 00143 00144 00145 00146 struct tableref_t 00147 { 00148 uint32_t offset; // Offset to a hash table. 00149 uint32_t num; // Number of elements in the hash table. 00150 }; 00151 00152 00153 static uint32_t get_data_begin() 00154 { 00155 return (16 + sizeof(tableref_t) * NUM_TABLES); 00156 } 00157 00158 00159 00163 class builder_exception : public std::invalid_argument 00164 { 00165 public: 00166 builder_exception(const std::string& msg) 00167 : std::invalid_argument(msg) 00168 { 00169 } 00170 }; 00171 00172 00173 00177 template <typename hash_function> 00178 class builder_base 00179 { 00180 protected: 00181 // A bucket structure. 00182 struct bucket 00183 { 00184 uint32_t hash; // Hash value of the record. 00185 uint32_t offset; // Offset address to the actual record. 00186 00187 bucket() : hash(0), offset(0) 00188 { 00189 } 00190 00191 bucket(uint32_t h, uint32_t o) : hash(h), offset(o) 00192 { 00193 } 00194 }; 00195 00196 // A hash table is a vector of buckets. 00197 typedef std::vector<bucket> hashtable; 00198 00199 protected: 00200 std::ofstream& m_os; // Output stream. 00201 uint32_t m_begin; 00202 uint32_t m_cur; 00203 hashtable m_ht[NUM_TABLES]; // Hash tables. 00204 00205 public: 00212 builder_base(std::ofstream& os) : m_os(os) 00213 { 00214 m_begin = m_os.tellp(); 00215 m_cur = get_data_begin(); 00216 m_os.seekp(m_begin + m_cur); 00217 } 00218 00222 virtual ~builder_base() 00223 { 00224 this->close(); 00225 } 00226 00236 template <class key_t, class value_t> 00237 void put(const key_t *key, size_t ksize, const value_t *value, size_t vsize) 00238 { 00239 // Write out the current record. 00240 write_uint32((uint32_t)ksize); 00241 m_os.write(reinterpret_cast<const char *>(key), ksize); 00242 write_uint32((uint32_t)vsize); 00243 m_os.write(reinterpret_cast<const char *>(value), vsize); 00244 00245 // Compute the hash value and choose a hash table. 00246 uint32_t hv = hash_function()(static_cast<const void *>(key), ksize); 00247 hashtable& ht = m_ht[hv % NUM_TABLES]; 00248 00249 // Store the hash value and offset to the hash table. 00250 ht.push_back(bucket(hv, m_cur)); 00251 00252 // Increment the current position. 00253 m_cur += sizeof(uint32_t) + ksize + sizeof(uint32_t) + vsize; 00254 } 00255 00256 protected: 00257 void close() 00258 { 00259 // Check the consistency of the stream offset. 00260 if (m_begin + m_cur != (uint32_t)m_os.tellp()) { 00261 throw builder_exception("Inconsistent stream offset"); 00262 } 00263 00264 // Store the hash tables. At this moment, the file pointer refers to 00265 // the offset succeeding the last key/value pair. 00266 for (size_t i = 0;i < NUM_TABLES;++i) { 00267 hashtable& ht = m_ht[i]; 00268 00269 // Do not write an empty hash table. 00270 if (!ht.empty()) { 00271 // An actual table will have the double size; half elements 00272 // in the table are kept empty. 00273 int n = ht.size() * 2; 00274 00275 // Allocate the actual table. 00276 bucket* dst = new bucket[n]; 00277 00278 // Put hash elements to the table with the open-address method. 00279 typename hashtable::const_iterator it; 00280 for (it = ht.begin();it != ht.end();++it) { 00281 int k = (it->hash >> 8) % n; 00282 00283 // Find a vacant element. 00284 while (dst[k].offset != 0) { 00285 k = (k+1) % n; 00286 } 00287 00288 // Store the hash element. 00289 dst[k].hash = it->hash; 00290 dst[k].offset = it->offset; 00291 } 00292 00293 // Write out the new table. 00294 for (int k = 0;k < n;++k) { 00295 write_uint32(dst[k].hash); 00296 write_uint32(dst[k].offset); 00297 } 00298 00299 // Free the table. 00300 delete[] dst; 00301 } 00302 } 00303 00304 // Store the current position. 00305 uint32_t offset = (uint32_t)m_os.tellp(); 00306 00307 // Rewind the stream position to the beginning. 00308 m_os.seekp(m_begin); 00309 00310 // Write the file header. 00311 char chunkid[4] = {'C','D','B','+'}; 00312 m_os.write(chunkid, 4); 00313 write_uint32(offset - m_begin); 00314 write_uint32(VERSION); 00315 write_uint32(BYTEORDER_CHECK); 00316 00317 // Write references to hash tables. At this moment, dbw->cur points 00318 // to the offset succeeding the last key/data pair. 00319 for (size_t i = 0;i < NUM_TABLES;++i) { 00320 // Offset to the hash table (or zero for non-existent tables). 00321 write_uint32(m_ht[i].empty() ? 0 : m_cur); 00322 // Bucket size is double to the number of elements. 00323 write_uint32(m_ht[i].size() * 2); 00324 // Advance the offset counter. 00325 m_cur += sizeof(uint32_t) * 2 * m_ht[i].size() * 2; 00326 } 00327 00328 // Seek to the last position. 00329 m_os.seekp(offset); 00330 } 00331 00332 inline void write_uint32(uint32_t value) 00333 { 00334 m_os.write(reinterpret_cast<const char *>(&value), sizeof(value)); 00335 } 00336 }; 00337 00338 00339 00343 class cdbpp_exception : public std::invalid_argument 00344 { 00345 public: 00346 cdbpp_exception(const std::string& msg) 00347 : std::invalid_argument(msg) 00348 { 00349 } 00350 }; 00351 00355 template <typename hash_function> 00356 class cdbpp_base 00357 { 00358 protected: 00359 struct bucket_t 00360 { 00361 uint32_t hash; // Hash value of the record. 00362 uint32_t offset; // Offset address to the actual record. 00363 }; 00364 00365 00366 struct hashtable_t 00367 { 00368 uint32_t num; // Number of elements in the table. 00369 const bucket_t* buckets; // Buckets (array of bucket). 00370 }; 00371 00372 00373 protected: 00374 const uint8_t* m_buffer; // Pointer to the memory block. 00375 size_t m_size; // Size of the memory block. 00376 bool m_own; // 00377 00378 hashtable_t m_ht[NUM_TABLES]; // Hash tables. 00379 size_t m_n; 00380 00381 public: 00385 cdbpp_base() 00386 : m_buffer(NULL), m_size(0), m_own(false), m_n(0) 00387 { 00388 } 00389 00397 cdbpp_base(const void *buffer, size_t size, bool own) 00398 : m_buffer(NULL), m_size(0), m_own(false), m_n(0) 00399 { 00400 this->open(buffer, size, own); 00401 } 00402 00408 cdbpp_base(std::ifstream& ifs) 00409 : m_buffer(NULL), m_size(0), m_own(false), m_n(0) 00410 { 00411 this->open(ifs); 00412 } 00413 00417 virtual ~cdbpp_base() 00418 { 00419 close(); 00420 } 00421 00427 bool is_open() const 00428 { 00429 return (m_buffer != NULL); 00430 } 00431 00436 size_t size() const 00437 { 00438 return m_n; 00439 } 00440 00446 bool empty() const 00447 { 00448 return (m_n == 0); 00449 } 00450 00456 size_t open(std::ifstream& ifs) 00457 { 00458 char chunk[4], size[4]; 00459 std::istream::pos_type offset = ifs.tellg(); 00460 00461 do { 00462 // Read a chunk identifier. 00463 ifs.read(chunk, 4); 00464 if (ifs.fail()) { 00465 break; 00466 } 00467 00468 // Check the chunk identifier. 00469 if (std::strncmp(chunk, "CDB+", 4) != 0) { 00470 break; 00471 } 00472 00473 // Read the size of the chunk. 00474 ifs.read(size, 4); 00475 if (ifs.fail()) { 00476 break; 00477 } 00478 00479 // Allocate a memory block for the chunk. 00480 uint32_t chunk_size = read_uint32(reinterpret_cast<uint8_t*>(size)); 00481 uint8_t* block = new uint8_t[chunk_size]; 00482 00483 // Read the memory image from the stream. 00484 ifs.seekg(offset, std::ios_base::beg); 00485 if (ifs.fail()) { 00486 break; 00487 } 00488 ifs.read(reinterpret_cast<char*>(block), chunk_size); 00489 if (ifs.fail()) { 00490 break; 00491 } 00492 00493 return this->open(block, chunk_size, true); 00494 00495 } while (0); 00496 00497 ifs.seekg(offset, std::ios::beg); 00498 return 0; 00499 } 00500 00508 size_t open(const void *buffer, size_t size, bool own = false) 00509 { 00510 const uint8_t *p = reinterpret_cast<const uint8_t*>(buffer); 00511 00512 // Make sure that the size of the chunk is larger than the minimum size. 00513 if (size < get_data_begin()) { 00514 throw cdbpp_exception("The memory image is smaller than a chunk header."); 00515 } 00516 00517 // Check the chunk identifier. 00518 if (memcmp(p, "CDB+", 4) != 0) { 00519 throw cdbpp_exception("Incorrect chunk header"); 00520 } 00521 p += 4; 00522 00523 // Read the chunk header. 00524 uint32_t csize = read_uint32(p); 00525 p += sizeof(uint32_t); 00526 uint32_t version = read_uint32(p); 00527 p += sizeof(uint32_t); 00528 uint32_t byteorder = read_uint32(p); 00529 p += sizeof(uint32_t); 00530 00531 // Check the byte-order consistency. 00532 if (byteorder != BYTEORDER_CHECK) { 00533 throw cdbpp_exception("Inconsistent byte order"); 00534 } 00535 // Check the chunk size. 00536 if (size < csize) { 00537 throw cdbpp_exception("The memory image is smaller than a chunk size."); 00538 } 00539 00540 // Set memory block and size. 00541 m_buffer = reinterpret_cast<const uint8_t*>(buffer); 00542 m_size = size; 00543 m_own = own; 00544 00545 // Set pointers to the hash tables. 00546 m_n = 0; 00547 const tableref_t* ref = reinterpret_cast<const tableref_t*>(p); 00548 for (size_t i = 0;i < NUM_TABLES;++i) { 00549 if (ref[i].offset) { 00550 // Set the buckets. 00551 m_ht[i].buckets = reinterpret_cast<const bucket_t*>(m_buffer + ref[i].offset); 00552 m_ht[i].num = ref[i].num; 00553 } else { 00554 // An empty hash table. 00555 m_ht[i].buckets = NULL; 00556 m_ht[i].num = 0; 00557 } 00558 00559 // The number of records is the half of the table size. 00560 m_n += (ref[i].num / 2); 00561 } 00562 00563 return (size_t)csize; 00564 } 00565 00569 void close() 00570 { 00571 if (m_own && m_buffer != NULL) { 00572 delete[] m_buffer; 00573 } 00574 m_buffer = NULL; 00575 m_size = 0; 00576 m_n = 0; 00577 } 00578 00587 const void* get(const void *key, size_t ksize, size_t* vsize) const 00588 { 00589 uint32_t hv = hash_function()(key, ksize); 00590 const hashtable_t* ht = &m_ht[hv % NUM_TABLES]; 00591 00592 if (ht->num && ht->buckets != NULL) { 00593 int n = ht->num; 00594 int k = (hv >> 8) % n; 00595 const bucket_t* p = NULL; 00596 00597 while (p = &ht->buckets[k], p->offset) { 00598 if (p->hash == hv) { 00599 const uint8_t *q = m_buffer + p->offset; 00600 if (read_uint32(q) == ksize && 00601 memcmp(key, q + sizeof(uint32_t), ksize) == 0) { 00602 q += sizeof(uint32_t) + ksize; 00603 if (vsize != NULL) { 00604 *vsize = read_uint32(q); 00605 } 00606 return q + sizeof(uint32_t); 00607 } 00608 } 00609 k = (k+1) % n; 00610 } 00611 } 00612 00613 if (vsize != NULL) { 00614 *vsize = 0; 00615 } 00616 return NULL; 00617 } 00618 00619 protected: 00620 inline uint32_t read_uint32(const uint8_t* p) const 00621 { 00622 return *reinterpret_cast<const uint32_t*>(p); 00623 } 00624 }; 00625 00627 typedef builder_base<murmurhash2> builder; 00629 typedef cdbpp_base<murmurhash2> cdbpp; 00630 00631 00632 }; 00633 00732 #endif/*__CDBPP_H__*/