From 28659827c3cc1c8854520500b89fc21e86fc7850 Mon Sep 17 00:00:00 2001 From: sowle Date: Wed, 28 Aug 2019 17:58:35 +0300 Subject: [PATCH] lmdb: reverting back to v 18 due to complex issues with resizing/synchronization lmdb v 24 requires manual resizing/growing during it's normal function Zano core is purely async so it's not easy to prevent all DB txs from starting on lmdb adapter level, because it will lead to random deadlocks in the core due to many high-level cross-thread dependencies. We will rethink this later. Many thanks to @leo-yuriev who helped us to discover these issues! --- contrib/db/liblmdb/CHANGES | 60 +- contrib/db/liblmdb/COPYRIGHT | 2 +- contrib/db/liblmdb/Doxyfile | 2 +- contrib/db/liblmdb/Makefile | 11 +- contrib/db/liblmdb/intro.doc | 2 +- contrib/db/liblmdb/lmdb.h | 78 +- contrib/db/liblmdb/mdb.c | 2282 ++++++++++++++-------- contrib/db/liblmdb/mdb_copy.1 | 11 +- contrib/db/liblmdb/mdb_copy.c | 6 +- contrib/db/liblmdb/mdb_dump.1 | 10 +- contrib/db/liblmdb/mdb_dump.c | 19 +- contrib/db/liblmdb/mdb_load.1 | 11 +- contrib/db/liblmdb/mdb_load.c | 64 +- contrib/db/liblmdb/mdb_stat.1 | 10 +- contrib/db/liblmdb/mdb_stat.c | 33 +- contrib/db/liblmdb/midl.c | 65 +- contrib/db/liblmdb/midl.h | 22 +- contrib/db/liblmdb/mtest.c | 2 +- contrib/db/liblmdb/mtest2.c | 2 +- contrib/db/liblmdb/mtest3.c | 2 +- contrib/db/liblmdb/mtest4.c | 2 +- contrib/db/liblmdb/mtest5.c | 2 +- contrib/db/liblmdb/mtest6.c | 2 +- contrib/db/liblmdb/sample-bdb.txt | 2 +- contrib/db/liblmdb/sample-mdb.txt | 2 +- src/common/db_abstract_accessor.h | 12 +- src/common/db_backend_base.h | 23 +- src/currency_core/blockchain_storage.cpp | 22 +- src/currency_core/currency_config.h | 6 +- src/currency_core/tx_pool.cpp | 19 +- src/daemon/daemon.cpp | 2 +- tests/unit_tests/db_accessors.cpp | 9 +- tests/unit_tests/lmdb_tests.cpp | 2 +- 33 files changed, 1806 insertions(+), 993 deletions(-) diff --git a/contrib/db/liblmdb/CHANGES b/contrib/db/liblmdb/CHANGES index f00efbb9..aabb1adf 100644 --- a/contrib/db/liblmdb/CHANGES +++ b/contrib/db/liblmdb/CHANGES @@ -1,70 +1,14 @@ LMDB 0.9 Change Log -LMDB 0.9.24 Release (2019/07/24) - ITS#8969 Tweak mdb_page_split - ITS#8975 WIN32 fix writemap set_mapsize crash - ITS#9007 Fix loose pages in WRITEMAP - -LMDB 0.9.23 Release (2018/12/19) - ITS#8756 Fix loose pages in dirty list - ITS#8831 Fix mdb_load flag init - ITS#8844 Fix mdb_env_close in forked process - Documentation - ITS#8857 mdb_cursor_del doesn't invalidate cursor - ITS#8908 GET_MULTIPLE etc don't change passed in key - -LMDB 0.9.22 Release (2018/03/22) - Fix MDB_DUPSORT alignment bug (ITS#8819) - Fix regression with new db from 0.9.19 (ITS#8760) - Fix liblmdb to build on Solaris (ITS#8612) - Fix delete behavior with DUPSORT DB (ITS#8622) - Fix mdb_cursor_get/mdb_cursor_del behavior (ITS#8722) - -LMDB 0.9.21 Release (2017/06/01) - Fix xcursor after cursor_del (ITS#8622) - -LMDB 0.9.20 (Withdrawn) - Fix mdb_load with escaped plaintext (ITS#8558) - Fix mdb_cursor_last / mdb_put interaction (ITS#8557) - -LMDB 0.9.19 Release (2016/12/28) - Fix mdb_env_cwalk cursor init (ITS#8424) - Fix robust mutexes on Solaris 10/11 (ITS#8339) - Tweak Win32 error message buffer - Fix MDB_GET_BOTH on non-dup record (ITS#8393) - Optimize mdb_drop - Fix xcursors after mdb_cursor_del (ITS#8406) - Fix MDB_NEXT_DUP after mdb_cursor_del (ITS#8412) - Fix mdb_cursor_put resetting C_EOF (ITS#8489) - Fix mdb_env_copyfd2 to return EPIPE on SIGPIPE (ITS#8504) - Fix mdb_env_copy with empty DB (ITS#8209) - Fix behaviors with fork (ITS#8505) - Fix mdb_dbi_open with mainDB cursors (ITS#8542) - Fix robust mutexes on kFreeBSD (ITS#8554) - Fix utf8_to_utf16 error checks (ITS#7992) - Fix F_NOCACHE on MacOS, error is non-fatal (ITS#7682) - Build - Make shared lib suffix overridable (ITS#8481) - Documentation - Cleanup doxygen nits - Note reserved vs actual mem/disk usage - - -LMDB 0.9.18 Release (2016/02/05) +LMDB 0.9.18 Release Engineering Fix robust mutex detection on glibc 2.10-11 (ITS#8330) - Fix page_search_root assert on FreeDB (ITS#8336) - Fix MDB_APPENDDUP vs. rewrite(single item) (ITS#8334) - Fix mdb_copy of large files on Windows - Fix subcursor move after delete (ITS#8355) - Fix mdb_midl_shirnk off-by-one (ITS#8363) Check for utf8_to_utf16 failures (ITS#7992) Catch strdup failure in mdb_dbi_open Build Additional makefile var tweaks (ITS#8169) Documentation Add Getting Started page - Update WRITEMAP description - + LMDB 0.9.17 Release (2015/11/30) Fix ITS#7377 catch calloc failure diff --git a/contrib/db/liblmdb/COPYRIGHT b/contrib/db/liblmdb/COPYRIGHT index f076556e..722d1a51 100644 --- a/contrib/db/liblmdb/COPYRIGHT +++ b/contrib/db/liblmdb/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright 2011-2019 Howard Chu, Symas Corp. +Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/Doxyfile b/contrib/db/liblmdb/Doxyfile index 5ca2cfe8..5047c0bb 100644 --- a/contrib/db/liblmdb/Doxyfile +++ b/contrib/db/liblmdb/Doxyfile @@ -253,7 +253,7 @@ IDL_PROPERTY_SUPPORT = YES # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. -DISTRIBUTE_GROUP_DOC = YES +DISTRIBUTE_GROUP_DOC = NO # Set the SUBGROUPING tag to YES (the default) to allow class member groups of # the same type (for instance a group of public functions) to be put as a diff --git a/contrib/db/liblmdb/Makefile b/contrib/db/liblmdb/Makefile index f254511f..f3c93a2f 100644 --- a/contrib/db/liblmdb/Makefile +++ b/contrib/db/liblmdb/Makefile @@ -8,7 +8,7 @@ # platforms; you should not need to change any of these. # Read their descriptions in mdb.c if you do: # -# - MDB_USE_POSIX_SEM +# - MDB_USE_POSIX_MUTEX, MDB_USE_POSIX_SEM, MDB_USE_SYSV_SEM # - MDB_DSYNC # - MDB_FDATASYNC # - MDB_FDATASYNC_WORKS @@ -24,9 +24,8 @@ W = -W -Wall -Wno-unused-parameter -Wbad-function-cast -Wuninitialized THREADS = -pthread OPT = -O2 -g CFLAGS = $(THREADS) $(OPT) $(W) $(XCFLAGS) -LDLIBS = -SOLIBS = -SOEXT = .so +LDLIBS = # -lntdll # Windows needs ntdll +SOLIBS = # -lntdll prefix = /usr/local exec_prefix = $(prefix) bindir = $(exec_prefix)/bin @@ -38,7 +37,7 @@ mandir = $(datarootdir)/man ######################################################################## IHDRS = lmdb.h -ILIBS = liblmdb.a liblmdb$(SOEXT) +ILIBS = liblmdb.a liblmdb.so IPROGS = mdb_stat mdb_copy mdb_dump mdb_load IDOCS = mdb_stat.1 mdb_copy.1 mdb_dump.1 mdb_load.1 PROGS = $(IPROGS) mtest mtest2 mtest3 mtest4 mtest5 @@ -64,7 +63,7 @@ test: all liblmdb.a: mdb.o midl.o $(AR) rs $@ mdb.o midl.o -liblmdb$(SOEXT): mdb.lo midl.lo +liblmdb.so: mdb.lo midl.lo # $(CC) $(LDFLAGS) -pthread -shared -Wl,-Bsymbolic -o $@ mdb.o midl.o $(SOLIBS) $(CC) $(LDFLAGS) -pthread -shared -o $@ mdb.lo midl.lo $(SOLIBS) diff --git a/contrib/db/liblmdb/intro.doc b/contrib/db/liblmdb/intro.doc index 64dfcaad..870c7bb8 100644 --- a/contrib/db/liblmdb/intro.doc +++ b/contrib/db/liblmdb/intro.doc @@ -1,5 +1,5 @@ /* - * Copyright 2015-2018 Howard Chu, Symas Corp. + * Copyright 2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/lmdb.h b/contrib/db/liblmdb/lmdb.h index 2f55290d..0bca3eb7 100644 --- a/contrib/db/liblmdb/lmdb.h +++ b/contrib/db/liblmdb/lmdb.h @@ -53,14 +53,15 @@ * * Fix: Check for stale readers periodically, using the * #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. - * Stale writers will be cleared automatically on some systems: + * Stale writers will be cleared automatically on most systems: * - Windows - automatic + * - BSD, systems using SysV semaphores - automatic * - Linux, systems using POSIX mutexes with Robust option - automatic - * - not on BSD, systems using POSIX semaphores. * Otherwise just make all programs using the database close it; * the lockfile is always reset on first open of the environment. * - * - On BSD systems or others configured with MDB_USE_POSIX_SEM, + * - On BSD systems or others configured with MDB_USE_SYSV_SEM or + * MDB_USE_POSIX_SEM, * startup can fail due to semaphores owned by another userid. * * Fix: Open and close the database as the user which owns the @@ -96,12 +97,11 @@ * transactions. Each transaction belongs to one thread. See below. * The #MDB_NOTLS flag changes this for read-only transactions. * - * - Use an MDB_env* in the process which opened it, not after fork(). + * - Use an MDB_env* in the process which opened it, without fork()ing. * * - Do not have open an LMDB database twice in the same process at * the same time. Not even from a plain open() call - close()ing it - * breaks fcntl() advisory locking. (It is OK to reopen it after - * fork() - exec*(), since the lockfile has FD_CLOEXEC set.) + * breaks flock() advisory locking. * * - Avoid long-lived transactions. Read transactions prevent * reuse of pages freed by newer write transactions, thus the @@ -135,7 +135,7 @@ * * @author Howard Chu, Symas Corporation. * - * @copyright Copyright 2011-2019 Howard Chu, Symas Corp. All rights reserved. + * @copyright Copyright 2011-2016 Howard Chu, Symas Corp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -166,6 +166,7 @@ #define _LMDB_H_ #include +#include #ifdef __cplusplus extern "C" { @@ -178,6 +179,13 @@ typedef int mdb_mode_t; typedef mode_t mdb_mode_t; #endif +#ifdef MDB_VL32 +typedef uint64_t mdb_size_t; +#define mdb_env_create mdb_env_create_vl32 /**< Prevent mixing with non-VL32 builds */ +#else +typedef size_t mdb_size_t; +#endif + /** An abstraction for a file handle. * On POSIX systems file handles are small integers. On Windows * they're opaque pointers. @@ -200,7 +208,7 @@ typedef int mdb_filehandle_t; /** Library minor version */ #define MDB_VERSION_MINOR 9 /** Library patch version */ -#define MDB_VERSION_PATCH 24 +#define MDB_VERSION_PATCH 70 /** Combine args a,b,c into a single integer for easy version comparisons */ #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) @@ -210,7 +218,7 @@ typedef int mdb_filehandle_t; MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) /** The release date of this library version */ -#define MDB_VERSION_DATE "July 24, 2019" +#define MDB_VERSION_DATE "December 19, 2015" /** A stringifier for the version info */ #define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")" @@ -303,6 +311,8 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_NORDAHEAD 0x800000 /** don't initialize malloc'd memory before writing to datafile */ #define MDB_NOMEMINIT 0x1000000 + /** use the previous snapshot rather than the latest one */ +#define MDB_PREVSNAPSHOT 0x2000000 /** @} */ /** @defgroup mdb_dbi_open Database Flags @@ -370,7 +380,7 @@ typedef enum MDB_cursor_op { MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for #MDB_DUPSORT */ MDB_GET_CURRENT, /**< Return key/data at current cursor position */ - MDB_GET_MULTIPLE, /**< Return up to a page of duplicate data items + MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items from current cursor position. Move cursor to prepare for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ MDB_LAST, /**< Position at last key/data item */ @@ -379,7 +389,7 @@ typedef enum MDB_cursor_op { MDB_NEXT, /**< Position at next data item */ MDB_NEXT_DUP, /**< Position at next data item of current key. Only for #MDB_DUPSORT */ - MDB_NEXT_MULTIPLE, /**< Return up to a page of duplicate data items + MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items from next cursor position. Move cursor to prepare for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ MDB_NEXT_NODUP, /**< Position at first data item of next key */ @@ -390,7 +400,7 @@ typedef enum MDB_cursor_op { MDB_SET, /**< Position at specified key */ MDB_SET_KEY, /**< Position at specified key, return key + data */ MDB_SET_RANGE, /**< Position at first key greater than or equal to specified key. */ - MDB_PREV_MULTIPLE /**< Position at previous page and return up to + MDB_PREV_MULTIPLE /**< Position at previous page and return key and up to a page of duplicate data items. Only for #MDB_DUPFIXED */ } MDB_cursor_op; @@ -457,18 +467,18 @@ typedef struct MDB_stat { unsigned int ms_psize; /**< Size of a database page. This is currently the same for all databases. */ unsigned int ms_depth; /**< Depth (height) of the B-tree */ - size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ - size_t ms_leaf_pages; /**< Number of leaf pages */ - size_t ms_overflow_pages; /**< Number of overflow pages */ - size_t ms_entries; /**< Number of data items */ + mdb_size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ + mdb_size_t ms_leaf_pages; /**< Number of leaf pages */ + mdb_size_t ms_overflow_pages; /**< Number of overflow pages */ + mdb_size_t ms_entries; /**< Number of data items */ } MDB_stat; /** @brief Information about the environment */ typedef struct MDB_envinfo { void *me_mapaddr; /**< Address of map, if fixed */ - size_t me_mapsize; /**< Size of the data memory map */ - size_t me_last_pgno; /**< ID of the last used page */ - size_t me_last_txnid; /**< ID of the last committed transaction */ + mdb_size_t me_mapsize; /**< Size of the data memory map */ + mdb_size_t me_last_pgno; /**< ID of the last used page */ + mdb_size_t me_last_txnid; /**< ID of the last committed transaction */ unsigned int me_maxreaders; /**< max reader slots in the environment */ unsigned int me_numreaders; /**< max reader slots used in the environment */ } MDB_envinfo; @@ -614,6 +624,12 @@ int mdb_env_create(MDB_env **env); * caller is expected to overwrite all of the memory that was * reserved in that case. * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_PREVSNAPSHOT + * Open the environment with the previous snapshot rather than the latest + * one. This loses the latest transaction, but may help work around some + * types of corruption. If opened with write access, this must be the + * only process using the environment. This flag is automatically reset + * after a write transaction is successfully committed. * * @param[in] mode The UNIX permissions to set on created files and semaphores. * This parameter is ignored on Windows. @@ -680,7 +696,6 @@ int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd); *
  • #MDB_CP_COMPACT - Perform compaction while copying: omit free * pages and sequentially renumber all pages in output. This option * consumes more CPU and runs more slowly than the default. - * Currently it fails if the environment has suffered a page leak. * * @return A non-zero error value on failure and 0 on success. */ @@ -795,10 +810,6 @@ int mdb_env_get_flags(MDB_env *env, unsigned int *flags); int mdb_env_get_path(MDB_env *env, const char **path); /** @brief Return the filedescriptor for the given environment. - * - * This function may be called after fork(), so the descriptor can be - * closed before exec*(). Other LMDB file descriptors have FD_CLOEXEC. - * (Until LMDB 0.9.18, only the lockfile had that.) * * @param[in] env An environment handle returned by #mdb_env_create() * @param[out] fd Address of a mdb_filehandle_t to contain the descriptor. @@ -842,7 +853,7 @@ int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd); * an active write transaction. * */ -int mdb_env_set_mapsize(MDB_env *env, size_t size); +int mdb_env_set_mapsize(MDB_env *env, mdb_size_t size); /** @brief Set the maximum number of threads/reader slots for the environment. * @@ -955,6 +966,10 @@ int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func); *
      *
    • #MDB_RDONLY * This transaction will not perform any write operations. + *
    • #MDB_NOSYNC + * Don't flush system buffers to disk when committing this transaction. + *
    • #MDB_NOMETASYNC + * Flush system buffers but omit metadata flush when committing this transaction. *
    * @param[out] txn Address where the new #MDB_txn handle will be stored * @return A non-zero error value on failure and 0 on success. Some possible @@ -987,7 +1002,7 @@ MDB_env *mdb_txn_env(MDB_txn *txn); * @param[in] txn A transaction handle returned by #mdb_txn_begin() * @return A transaction ID, valid if input is an active transaction. */ -size_t mdb_txn_id(MDB_txn *txn); +mdb_size_t mdb_txn_id(MDB_txn *txn); /** @brief Commit all the operations of a transaction into the database. * @@ -1103,9 +1118,8 @@ int mdb_txn_renew(MDB_txn *txn); * This flag may only be used in combination with #MDB_DUPSORT. This option * tells the library that the data items for this database are all the same * size, which allows further optimizations in storage and retrieval. When - * all data items are the same size, the #MDB_GET_MULTIPLE, #MDB_NEXT_MULTIPLE - * and #MDB_PREV_MULTIPLE cursor operations may be used to retrieve multiple - * items at once. + * all data items are the same size, the #MDB_GET_MULTIPLE and #MDB_NEXT_MULTIPLE + * cursor operations may be used to retrieve multiple items at once. *
  • #MDB_INTEGERDUP * This option specifies that duplicate data items are binary integers, * similar to #MDB_INTEGERKEY keys. @@ -1510,10 +1524,6 @@ int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, /** @brief Delete current key/data pair * * This function deletes the key/data pair to which the cursor refers. - * This does not invalidate the cursor, so operations such as MDB_NEXT - * can still be used on it. - * Both MDB_NEXT and MDB_GET_CURRENT will return the same record after - * this operation. * @param[in] cursor A cursor handle returned by #mdb_cursor_open() * @param[in] flags Options for this operation. This parameter * must be set to 0 or one of the values described here. @@ -1542,7 +1552,7 @@ int mdb_cursor_del(MDB_cursor *cursor, unsigned int flags); *
  • EINVAL - cursor is not initialized, or an invalid parameter was specified. * */ -int mdb_cursor_count(MDB_cursor *cursor, size_t *countp); +int mdb_cursor_count(MDB_cursor *cursor, mdb_size_t *countp); /** @brief Compare two data items according to a particular database. * diff --git a/contrib/db/liblmdb/mdb.c b/contrib/db/liblmdb/mdb.c index 692feaa3..ca9f3b12 100644 --- a/contrib/db/liblmdb/mdb.c +++ b/contrib/db/liblmdb/mdb.c @@ -5,7 +5,7 @@ * BerkeleyDB API, but much simplified. */ /* - * Copyright 2011-2019 Howard Chu, Symas Corp. + * Copyright 2011-2016 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,13 +35,50 @@ #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 #endif -#if defined(__WIN64__) +#if defined(MDB_VL32) || defined(__WIN64__) #define _FILE_OFFSET_BITS 64 #endif #ifdef _WIN32 #include #include -#include /* get wcscpy() */ +#include + +#ifndef _NTDEF_ +typedef _Return_type_success_(return >= 0) LONG NTSTATUS; +typedef NTSTATUS *PNTSTATUS; +#endif + +/** Visual studio big files support +*/ + +/* We use native NT APIs to setup the memory map, so that we can + * let the DB file grow incrementally instead of always preallocating + * the full size. These APIs are defined in and + * but those headers are meant for driver-level development and + * conflict with the regular user-level headers, so we explicitly + * declare them here. Using these APIs also means we must link to + * ntdll.dll, which is not linked by default in user code. + */ +NTSTATUS WINAPI +NtCreateSection(OUT PHANDLE sh, IN ACCESS_MASK acc, + IN void * oa OPTIONAL, + IN PLARGE_INTEGER ms OPTIONAL, + IN ULONG pp, IN ULONG aa, IN HANDLE fh OPTIONAL); + +typedef enum _SECTION_INHERIT { + ViewShare = 1, + ViewUnmap = 2 +} SECTION_INHERIT; + +NTSTATUS WINAPI +NtMapViewOfSection(IN PHANDLE sh, IN HANDLE ph, + IN OUT PVOID *addr, IN ULONG_PTR zbits, + IN SIZE_T cs, IN OUT PLARGE_INTEGER off OPTIONAL, + IN OUT PSIZE_T vs, IN SECTION_INHERIT ih, + IN ULONG at, IN ULONG pp); + +NTSTATUS WINAPI +NtClose(HANDLE h); /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it * as int64 which is wrong. MSVC doesn't define it at all, so just @@ -93,6 +130,13 @@ extern int cacheflush(char *addr, int nbytes, int cache); #define BROKEN_FDATASYNC #endif +#ifdef _WIN32 +typedef int64_t off64_t; +#else +typedef off_t off64_t; +#endif + + #include #include #include @@ -109,14 +153,10 @@ typedef SSIZE_T ssize_t; #include #endif -#if defined(__sun) || defined(ANDROID) +#if defined(__sun) || defined(__ANDROID__) /* Most platforms have posix_memalign, older may only have memalign */ #define HAVE_MEMALIGN 1 #include -/* On Solaris, we need the POSIX sigwait function */ -#if defined (__sun) -# define _POSIX_PTHREAD_SEMANTICS 1 -#endif #endif #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) @@ -124,25 +164,36 @@ typedef SSIZE_T ssize_t; #include /* defines BYTE_ORDER on HPUX and Solaris */ #endif -#if defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) -# define MDB_USE_POSIX_SEM 1 +#if defined(__APPLE__) || defined (BSD) +# if !(defined(MDB_USE_POSIX_MUTEX) || defined(MDB_USE_POSIX_SEM)) +# define MDB_USE_SYSV_SEM 1 +# endif # define MDB_FDATASYNC fsync -#elif defined(ANDROID) +#elif defined(__ANDROID__) # define MDB_FDATASYNC fsync #endif #ifndef _WIN32 #include -#include #ifdef MDB_USE_POSIX_SEM # define MDB_USE_HASH 1 #include +#elif defined(MDB_USE_SYSV_SEM) +#include +#include +#ifdef _SEM_SEMUN_UNDEFINED +union semun { + int val; + struct semid_ds *buf; + unsigned short *array; +}; +#endif /* _SEM_SEMUN_UNDEFINED */ #else #define MDB_USE_POSIX_MUTEX 1 -#endif -#endif +#endif /* MDB_USE_POSIX_SEM */ +#endif /* !_WIN32 */ -#if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) \ +#if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) + defined(MDB_USE_SYSV_SEM) \ + defined(MDB_USE_POSIX_MUTEX) != 1 # error "Ambiguous shared-lock implementation" #endif @@ -244,6 +295,8 @@ typedef SSIZE_T ssize_t; #define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) #ifdef _WIN32 #define MDB_OWNERDEAD ((int) WAIT_ABANDONED) +#elif defined MDB_USE_SYSV_SEM +#define MDB_OWNERDEAD (MDB_LAST_ERRCODE + 11) #elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD) #define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */ #endif @@ -254,31 +307,28 @@ typedef SSIZE_T ssize_t; /** Some platforms define the EOWNERDEAD error code * even though they don't support Robust Mutexes. * Compile with -DMDB_USE_ROBUST=0, or use some other - * mechanism like -DMDB_USE_POSIX_SEM instead of - * -DMDB_USE_POSIX_MUTEX. - * (Posix semaphores are not robust.) + * mechanism like -DMDB_USE_SYSV_SEM instead of + * -DMDB_USE_POSIX_MUTEX. (SysV semaphores are + * also Robust, but some systems don't support them + * either.) */ #ifndef MDB_USE_ROBUST /* Android currently lacks Robust Mutex support. So does glibc < 2.4. */ -# if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \ +# if defined(MDB_USE_POSIX_MUTEX) && (defined(__ANDROID__) || \ (defined(__GLIBC__) && GLIBC_VER < 0x020004)) # define MDB_USE_ROBUST 0 # else # define MDB_USE_ROBUST 1 -# endif -#endif /* !MDB_USE_ROBUST */ - -#if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST) /* glibc < 2.12 only provided _np API */ -# if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \ - (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST)) +# if defined(__GLIBC__) && GLIBC_VER < 0x02000c # define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP # define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) # define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) # endif -#endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */ +# endif +#endif /* MDB_USE_ROBUST */ -#if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST) +#if defined(MDB_OWNERDEAD) && MDB_USE_ROBUST #define MDB_ROBUST_SUPPORTED 1 #endif @@ -301,10 +351,8 @@ typedef HANDLE mdb_mutex_t, mdb_mutexref_t; #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) #define pthread_cond_signal(x) SetEvent(*x) #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) -#define THREAD_CREATE(thr,start,arg) \ - (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode()) -#define THREAD_FINISH(thr) \ - (WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0) +#define THREAD_CREATE(thr,start,arg) thr=CreateThread(NULL,0,start,arg,0,NULL) +#define THREAD_FINISH(thr) WaitForSingleObject(thr, INFINITE) #define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE) #define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex) #define mdb_mutex_consistent(mutex) 0 @@ -344,16 +392,50 @@ mdb_sem_wait(sem_t *sem) return rc; } +#elif defined MDB_USE_SYSV_SEM + +typedef struct mdb_mutex { + int semid; + int semnum; + int *locked; +} mdb_mutex_t[1], *mdb_mutexref_t; + +#define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex) +#define UNLOCK_MUTEX(mutex) do { \ + struct sembuf sb = { 0, 1, SEM_UNDO }; \ + sb.sem_num = (mutex)->semnum; \ + *(mutex)->locked = 0; \ + semop((mutex)->semid, &sb, 1); \ +} while(0) + +static int +mdb_sem_wait(mdb_mutexref_t sem) +{ + int rc, *locked = sem->locked; + struct sembuf sb = { 0, -1, SEM_UNDO }; + sb.sem_num = sem->semnum; + do { + if (!semop(sem->semid, &sb, 1)) { + rc = *locked ? MDB_OWNERDEAD : MDB_SUCCESS; + *locked = 1; + break; + } + } while ((rc = errno) == EINTR); + return rc; +} + +#define mdb_mutex_consistent(mutex) 0 + #else /* MDB_USE_POSIX_MUTEX: */ - /** Shared mutex/semaphore as the original is stored. + /** Shared mutex/semaphore as it is stored (mdb_mutex_t), and as + * local variables keep it (mdb_mutexref_t). * - * Not for copies. Instead it can be assigned to an #mdb_mutexref_t. - * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, then it - * is array[size 1] so it can be assigned to the pointer. + * An mdb_mutex_t can be assigned to an mdb_mutexref_t. They can + * be the same, or an array[size 1] and a pointer. + * @{ */ -typedef pthread_mutex_t mdb_mutex_t[1]; - /** Reference to an #mdb_mutex_t */ -typedef pthread_mutex_t *mdb_mutexref_t; +typedef pthread_mutex_t mdb_mutex_t[1], *mdb_mutexref_t; + /* @} */ /** Lock the reader or writer mutex. * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX(). */ @@ -364,7 +446,7 @@ typedef pthread_mutex_t *mdb_mutexref_t; /** Mark mutex-protected data as repaired, after death of previous owner. */ #define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex) -#endif /* MDB_USE_POSIX_SEM */ +#endif /* MDB_USE_POSIX_SEM || MDB_USE_SYSV_SEM */ /** Get the error code for the last failed system function. */ @@ -389,12 +471,30 @@ typedef pthread_mutex_t *mdb_mutexref_t; #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) #endif +#ifdef MDB_VL32 +#ifdef _WIN32 +#define Y "I64" +#else +#define Y "ll" +#endif +#else +#define Y Z +#endif + #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) #define MNAME_LEN 32 +#elif defined(MDB_USE_SYSV_SEM) +#define MNAME_LEN (sizeof(int)) #else #define MNAME_LEN (sizeof(pthread_mutex_t)) #endif +#ifdef MDB_USE_SYSV_SEM +#define SYSV_SEM_FLAG 1 /**< SysV sems in lockfile format */ +#else +#define SYSV_SEM_FLAG 0 +#endif + /** @} */ #ifdef MDB_ROBUST_SUPPORTED @@ -536,7 +636,7 @@ static txnid_t mdb_debug_start; /** The version number for a database's datafile format. */ #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) /** The version number for a database's lockfile format. */ -#define MDB_LOCK_VERSION 1 +#define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) /** @brief The max size of a key we can write, or 0 for computed max. * @@ -725,14 +825,6 @@ typedef struct MDB_txbody { uint32_t mtb_magic; /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ uint32_t mtb_format; -#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) - char mtb_rmname[MNAME_LEN]; -#else - /** Mutex protecting access to this table. - * This is the reader table lock used with LOCK_MUTEX(). - */ - mdb_mutex_t mtb_rmutex; -#endif /** The ID of the last transaction committed to the database. * This is recorded here only for convenience; the value can always * be determined by reading the main database meta pages. @@ -743,6 +835,17 @@ typedef struct MDB_txbody { * when readers release their slots. */ volatile unsigned mtb_numreaders; +#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) + char mtb_rmname[MNAME_LEN]; +#elif defined(MDB_USE_SYSV_SEM) + int mtb_semid; + int mtb_rlocked; +#else + /** Mutex protecting access to this table. + * This is the reader table lock used with LOCK_MUTEX(). + */ + mdb_mutex_t mtb_rmutex; +#endif } MDB_txbody; /** The actual reader table definition. */ @@ -755,12 +858,19 @@ typedef struct MDB_txninfo { #define mti_rmname mt1.mtb.mtb_rmname #define mti_txnid mt1.mtb.mtb_txnid #define mti_numreaders mt1.mtb.mtb_numreaders +#ifdef MDB_USE_SYSV_SEM +#define mti_semid mt1.mtb.mtb_semid +#define mti_rlocked mt1.mtb.mtb_rlocked +#endif char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; } mt1; union { #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) char mt2_wmname[MNAME_LEN]; #define mti_wmname mt2.mt2_wmname +#elif defined MDB_USE_SYSV_SEM + int mt2_wlocked; +#define mti_wlocked mt2.mt2_wlocked #else mdb_mutex_t mt2_wmutex; #define mti_wmutex mt2.mt2_wmutex @@ -775,26 +885,13 @@ typedef struct MDB_txninfo { ((uint32_t) \ ((MDB_LOCK_VERSION) \ /* Flags which describe functionality */ \ + + (SYSV_SEM_FLAG << 18) \ + (((MDB_PIDLOCK) != 0) << 16))) /** @} */ -/** Common header for all page types. The page type depends on #mp_flags. - * - * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with - * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages - * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. - * - * #P_OVERFLOW records occupy one or more contiguous pages where only the - * first has a page header. They hold the real data of #F_BIGDATA nodes. - * - * #P_SUBP sub-pages are small leaf "pages" with duplicate data. - * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. - * (Duplicate data can also go in sub-databases, which use normal pages.) - * - * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. - * - * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once - * in the snapshot: Either used by a database or listed in a freeDB record. +/** Common header for all page types. + * Overflow records occupy a number of contiguous pages with no + * headers on any page after the first. */ typedef struct MDB_page { #define mp_pgno mp_p.p_pgno @@ -803,7 +900,7 @@ typedef struct MDB_page { pgno_t p_pgno; /**< page number */ struct MDB_page *p_next; /**< for in-memory list of freed pages */ } mp_p; - uint16_t mp_pad; /**< key size if this is a LEAF2 page */ + uint16_t mp_pad; /** @defgroup mdb_page Page Flags * @ingroup internal * Flags for the page headers. @@ -870,34 +967,25 @@ typedef struct MDB_page { /** The number of overflow pages needed to store the given size. */ #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) - /** Link in #MDB_txn.%mt_loose_pgs list. - * Kept outside the page header, which is needed when reusing the page. - */ + /** Link in #MDB_txn.%mt_loose_pgs list */ #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) /** Header for a single key/data pair within a page. * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. * We guarantee 2-byte alignment for 'MDB_node's. - * - * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child - * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used - * for pgno. (Branch nodes have no flags). Lo and hi are in host byte - * order in case some accesses can be optimized to 32-bit word access. - * - * Leaf node flags describe node contents. #F_BIGDATA says the node's - * data part is the page number of an overflow page with actual data. - * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in - * a sub-page/sub-database, and named databases (just #F_SUBDATA). */ typedef struct MDB_node { - /** part of data size or pgno - * @{ */ + /** lo and hi are used for data size on leaf nodes and for + * child pgno on branch nodes. On 64 bit platforms, flags + * is also used for pgno. (Branch nodes have no flags). + * They are in host byte order in case that lets some + * accesses be optimized into a 32-bit word access. + */ #if BYTE_ORDER == LITTLE_ENDIAN - unsigned short mn_lo, mn_hi; + unsigned short mn_lo, mn_hi; /**< part of data size or pgno */ #else unsigned short mn_hi, mn_lo; #endif - /** @} */ /** @defgroup mdb_node Node Flags * @ingroup internal * Flags for node headers. @@ -1003,13 +1091,13 @@ typedef struct MDB_db { pgno_t md_branch_pages; /**< number of internal pages */ pgno_t md_leaf_pages; /**< number of leaf pages */ pgno_t md_overflow_pages; /**< number of overflow pages */ - size_t md_entries; /**< number of data items */ + mdb_size_t md_entries; /**< number of data items */ pgno_t md_root; /**< the root page of this tree */ } MDB_db; + /** mdb_dbi_open flags */ #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) - /** #mdb_dbi_open() flags */ #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) @@ -1033,17 +1121,22 @@ typedef struct MDB_meta { uint32_t mm_magic; /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ uint32_t mm_version; +#ifdef MDB_VL32 + union { /* always zero since we don't support fixed mapping in MDB_VL32 */ + MDB_ID mmun_ull; + void *mmun_address; + } mm_un; +#define mm_address mm_un.mmun_address +#else void *mm_address; /**< address for fixed mapping */ - size_t mm_mapsize; /**< size of mmap region */ +#endif + pgno_t mm_mapsize; /**< size of mmap region */ MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ /** The size of pages used in this DB */ #define mm_psize mm_dbs[FREE_DBI].md_pad /** Any persistent environment flags. @ref mdb_env */ #define mm_flags mm_dbs[FREE_DBI].md_flags - /** Last used page in the datafile. - * Actually the file may be shorter if the freeDB lists the final pages. - */ - pgno_t mm_last_pg; + pgno_t mm_last_pg; /**< last used page in file */ volatile txnid_t mm_txnid; /**< txnid that committed this page */ } MDB_meta; @@ -1080,6 +1173,9 @@ struct MDB_txn { /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ MDB_txn *mt_child; pgno_t mt_next_pgno; /**< next unallocated page */ +#ifdef MDB_VL32 + pgno_t mt_last_pgno; /**< last written page */ +#endif /** The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. @@ -1093,7 +1189,7 @@ struct MDB_txn { * in this transaction, linked through #NEXT_LOOSE_PAGE(page). */ MDB_page *mt_loose_pgs; - /** Number of loose pages (#mt_loose_pgs) */ + /* #Number of loose pages (#mt_loose_pgs) */ int mt_loose_count; /** The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are @@ -1116,17 +1212,29 @@ struct MDB_txn { * @ingroup internal * @{ */ -#define DB_DIRTY 0x01 /**< DB was written in this txn */ +#define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */ #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ #define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ -#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ /** @} */ /** In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; /** Array of flags for each DB */ unsigned char *mt_dbflags; +#ifdef MDB_VL32 + /** List of read-only pages (actually chunks) */ + MDB_ID3L mt_rpages; + /** We map chunks of 16 pages. Even though Windows uses 4KB pages, all + * mappings must begin on 64KB boundaries. So we round off all pgnos to + * a chunk boundary. We do the same on Linux for symmetry, and also to + * reduce the frequency of mmap/munmap calls. + */ +#define MDB_RPAGE_CHUNK 16 +#define MDB_TRPAGE_SIZE 4096 /**< size of #mt_rpages array of chunks */ +#define MDB_TRPAGE_MAX (MDB_TRPAGE_SIZE-1) /**< maximum chunk index */ + unsigned int mt_rpcheck; /**< threshold for reclaiming unref'd chunks */ +#endif /** Number of DB records in use, or 0 when the txn is finished. * This number only ever increments until the txn finishes; we * don't decrement it when individual DB handles are closed. @@ -1138,7 +1246,9 @@ struct MDB_txn { * @{ */ /** #mdb_txn_begin() flags */ -#define MDB_TXN_BEGIN_FLAGS MDB_RDONLY +#define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC|MDB_NOSYNC|MDB_RDONLY) +#define MDB_TXN_NOMETASYNC MDB_NOMETASYNC /**< don't sync meta for this txn on commit */ +#define MDB_TXN_NOSYNC MDB_NOSYNC /**< don't sync this txn on commit */ #define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ /* internal txn flags */ #define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */ @@ -1204,10 +1314,19 @@ struct MDB_cursor { #define C_SUB 0x04 /**< Cursor is a sub-cursor */ #define C_DEL 0x08 /**< last op was a cursor_del */ #define C_UNTRACK 0x40 /**< Un-track cursor when closing */ +#define C_WRITEMAP MDB_TXN_WRITEMAP /**< Copy of txn flag */ +/** Read-only cursor into the txn's original snapshot in the map. + * Set for read-only txns, and in #mdb_page_alloc() for #FREE_DBI when + * #MDB_DEVEL & 2. Only implements code which is necessary for this. + */ +#define C_ORIG_RDONLY MDB_TXN_RDONLY /** @} */ unsigned int mc_flags; /**< @ref mdb_cursor */ MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ +#ifdef MDB_VL32 + MDB_page *mc_ovpg; /**< a referenced overflow page */ +#endif }; /** Context for sorted-dup records. @@ -1226,23 +1345,6 @@ typedef struct MDB_xcursor { unsigned char mx_dbflag; } MDB_xcursor; - /** Check if there is an inited xcursor */ -#define XCURSOR_INITED(mc) \ - ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) - - /** Update the xcursor's sub-page pointer, if any, in \b mc. Needed - * when the node which contains the sub-page may have moved. Called - * with leaf page \b mp = mc->mc_pg[\b top]. - */ -#define XCURSOR_REFRESH(mc, top, mp) do { \ - MDB_page *xr_pg = (mp); \ - MDB_node *xr_node; \ - if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \ - xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \ - if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ - (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ -} while (0) - /** State of FreeDB old pages, stored in the MDB_env */ typedef struct MDB_pgstate { pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ @@ -1253,7 +1355,10 @@ typedef struct MDB_pgstate { struct MDB_env { HANDLE me_fd; /**< The main data file */ HANDLE me_lfd; /**< The lock file */ - HANDLE me_mfd; /**< For writing and syncing the meta pages */ + HANDLE me_mfd; /**< just for writing the meta pages */ +#if defined(MDB_VL32) && defined(_WIN32) + HANDLE me_fmh; /**< File Mapping handle */ +#endif /** Failed to update the meta page. Probably an I/O error. */ #define MDB_FATAL_ERROR 0x80000000U /** Some fields are initialized. */ @@ -1278,8 +1383,8 @@ struct MDB_env { void *me_pbuf; /**< scratch area for DUPSORT put() */ MDB_txn *me_txn; /**< current write transaction */ MDB_txn *me_txn0; /**< prealloc'd write transaction */ - size_t me_mapsize; /**< size of the data memory map */ - off_t me_size; /**< current file size */ + mdb_size_t me_mapsize; /**< size of the data memory map */ + off64_t me_size; /**< current file size */ pgno_t me_maxpg; /**< me_mapsize / me_psize */ MDB_dbx *me_dbxs; /**< array of static DB info */ uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ @@ -1311,6 +1416,13 @@ struct MDB_env { #else mdb_mutex_t me_rmutex; mdb_mutex_t me_wmutex; +#endif +#ifdef MDB_VL32 + MDB_ID3L me_rpages; /**< like #mt_rpages, but global to env */ + pthread_mutex_t me_rpmutex; /**< control access to #me_rpages */ +#define MDB_ERPAGE_SIZE 16384 +#define MDB_ERPAGE_MAX (MDB_ERPAGE_SIZE-1) + unsigned int me_rpcheck; #endif void *me_userctx; /**< User-settable context */ MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ @@ -1372,7 +1484,7 @@ static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, unsigned int nflags); -static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); +static int mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta); static MDB_meta *mdb_env_pick_meta(const MDB_env *env); static int mdb_env_write_meta(MDB_txn *txn); #ifdef MDB_USE_POSIX_MUTEX /* Drop unused excl arg */ @@ -1431,8 +1543,7 @@ static SECURITY_DESCRIPTOR mdb_null_sd; static SECURITY_ATTRIBUTES mdb_all_sa; static int mdb_sec_inited; -struct MDB_name; -static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra); +static int utf8_to_utf16(const char *src, int srcsize, wchar_t **dst, int *dstsize); #endif /** Return the library version info. */ @@ -1611,20 +1722,20 @@ mdb_page_list(MDB_page *mp) case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; case P_OVERFLOW: - fprintf(stderr, "Overflow page %"Z"u pages %u%s\n", + fprintf(stderr, "Overflow page %"Y"u pages %u%s\n", pgno, mp->mp_pages, state); return; case P_META: - fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n", + fprintf(stderr, "Meta-page %"Y"u txnid %"Y"u\n", pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); return; default: - fprintf(stderr, "Bad page %"Z"u flags 0x%X\n", pgno, mp->mp_flags); + fprintf(stderr, "Bad page %"Y"u flags 0x%u\n", pgno, mp->mp_flags); return; } nkeys = NUMKEYS(mp); - fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state); + fprintf(stderr, "%s %"Y"u numkeys %d%s\n", type, pgno, nkeys, state); for (i=0; imn_data; nsize = NODESIZE + key.mv_size; if (IS_BRANCH(mp)) { - fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), + fprintf(stderr, "key %d: page %"Y"u, %s\n", i, NODEPGNO(node), DKEY(&key)); total += nsize; } else { @@ -1674,7 +1785,7 @@ mdb_cursor_chk(MDB_cursor *mc) } if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) printf("ack!\n"); - if (XCURSOR_INITED(mc)) { + if (mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { @@ -1735,7 +1846,7 @@ static void mdb_audit(MDB_txn *txn) } } if (freecount + count + NUM_METAS != txn->mt_next_pgno) { - fprintf(stderr, "audit: %"Z"u freecount: %"Z"u count: %"Z"u total: %"Z"u next_pgno: %"Z"u\n", + fprintf(stderr, "audit: %"Y"u freecount: %"Y"u count: %"Y"u total: %"Y"u next_pgno: %"Y"u\n", txn->mt_txnid, freecount, count+NUM_METAS, freecount+count+NUM_METAS, txn->mt_next_pgno); } @@ -1752,8 +1863,8 @@ int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp; -#if UINT_MAX < SIZE_MAX - if (dcmp == mdb_cmp_int && a->mv_size == sizeof(size_t)) +#if UINT_MAX < SIZE_MAX || defined(MDB_VL32) + if (dcmp == mdb_cmp_int && a->mv_size == sizeof(mdb_size_t)) dcmp = mdb_cmp_clong; #endif return dcmp(a, b); @@ -1761,7 +1872,6 @@ mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) /** Allocate memory for a page. * Re-use old malloc'd pages first for singletons, otherwise just malloc. - * Set #MDB_TXN_ERROR on failure. */ static MDB_page * mdb_page_malloc(MDB_txn *txn, unsigned num) @@ -1836,6 +1946,47 @@ mdb_dlist_free(MDB_txn *txn) dl[0].mid = 0; } +#ifdef MDB_VL32 +static void +mdb_page_unref(MDB_txn *txn, MDB_page *mp) +{ + pgno_t pgno; + MDB_ID3L tl = txn->mt_rpages; + unsigned x, rem; + if (mp->mp_flags & (P_SUBP|P_DIRTY)) + return; + rem = mp->mp_pgno & (MDB_RPAGE_CHUNK-1); + pgno = mp->mp_pgno ^ rem; + x = mdb_mid3l_search(tl, pgno); + if (x != tl[0].mid && tl[x+1].mid == mp->mp_pgno) + x++; + if (tl[x].mref) + tl[x].mref--; +} +#define MDB_PAGE_UNREF(txn, mp) mdb_page_unref(txn, mp) + +static void +mdb_cursor_unref(MDB_cursor *mc) +{ + int i; + if (mc->mc_txn->mt_rpages[0].mid) { + if (!mc->mc_snum || !mc->mc_pg[0] || IS_SUBP(mc->mc_pg[0])) + return; + for (i=0; imc_snum; i++) + mdb_page_unref(mc->mc_txn, mc->mc_pg[i]); + if (mc->mc_ovpg) { + mdb_page_unref(mc->mc_txn, mc->mc_ovpg); + mc->mc_ovpg = 0; + } + } + mc->mc_snum = mc->mc_top = 0; + mc->mc_pg[0] = NULL; + mc->mc_flags &= ~C_INITIALIZED; +} +#else +#define MDB_PAGE_UNREF(txn, mp) +#endif /* MDB_VL32 */ + /** Loosen or free a single page. * Saves single pages to a list for future reuse * in this same txn. It has been pulled from the freeDB @@ -1877,7 +2028,7 @@ mdb_page_loose(MDB_cursor *mc, MDB_page *mp) } } if (loose) { - DPRINTF(("loosen db %d page %"Z"u", DDBI(mc), + DPRINTF(("loosen db %d page %"Y"u", DDBI(mc), mp->mp_pgno)); NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; txn->mt_loose_pgs = mp; @@ -2129,13 +2280,15 @@ mdb_page_dirty(MDB_txn *txn, MDB_page *mp) } /** Allocate page numbers and memory for writing. Maintain me_pglast, - * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. + * me_pghead and mt_next_pgno. * * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. * Do not modify the freedB, just merge freeDB records into me_pghead[] * and move me_pglast to say which records were consumed. Only this * function can create me_pghead and move me_pglast/mt_next_pgno. + * When #MDB_DEVEL & 2, it is not affected by #mdb_freelist_save(): it + * then uses the transaction's original snapshot of the freeDB. * @param[in] mc cursor A cursor handle identifying the transaction and * database for which we are allocating. * @param[in] num the number of pages to allocate. @@ -2173,7 +2326,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) np = txn->mt_loose_pgs; txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); txn->mt_loose_count--; - DPRINTF(("db %d use loose page %"Z"u", DDBI(mc), + DPRINTF(("db %d use loose page %"Y"u", DDBI(mc), np->mp_pgno)); *mp = np; return MDB_SUCCESS; @@ -2211,6 +2364,14 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) last = env->me_pglast; oldest = env->me_pgoldest; mdb_cursor_init(&m2, txn, FREE_DBI, NULL); +#if (MDB_DEVEL) & 2 /* "& 2" so MDB_DEVEL=1 won't hide bugs breaking freeDB */ + /* Use original snapshot. TODO: Should need less care in code + * which modifies the database. Maybe we can delete some code? + */ + m2.mc_flags |= C_ORIG_RDONLY; + m2.mc_db = &env->me_metas[(txn->mt_txnid-1) & 1]->mm_dbs[FREE_DBI]; + m2.mc_dbflag = (unsigned char *)""; /* probably unnecessary */ +#endif if (last) { op = MDB_SET_RANGE; key.mv_data = &last; /* will look up last+1 */ @@ -2252,7 +2413,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) np = m2.mc_pg[m2.mc_top]; leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS) - goto fail; + return rc; idl = (MDB_ID *) data.mv_data; i = idl[0]; @@ -2268,10 +2429,10 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) } env->me_pglast = last; #if (MDB_DEBUG) > 1 - DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", + DPRINTF(("IDL read txn %"Y"u root %"Y"u num %u", last, txn->mt_dbs[FREE_DBI].md_root, i)); for (j = i; j; j--) - DPRINTF(("IDL %"Z"u", idl[j])); + DPRINTF(("IDL %"Y"u", idl[j])); #endif /* Merge in descending sorted order */ mdb_midl_xmerge(mop, idl); @@ -2286,6 +2447,20 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) rc = MDB_MAP_FULL; goto fail; } +#if defined(_WIN32) && !defined(MDB_VL32) + if (!(env->me_flags & MDB_RDONLY)) { + void *p; + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + p = VirtualAlloc(p, env->me_psize * num, MEM_COMMIT, + (env->me_flags & MDB_WRITEMAP) ? PAGE_READWRITE: + PAGE_READONLY); + if (!p) { + DPUTS("VirtualAlloc failed"); + rc = ErrCode(); + goto fail; + } + } +#endif search_done: if (env->me_flags & MDB_WRITEMAP) { @@ -2402,7 +2577,6 @@ mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) } /** Touch a page: make it dirty and re-insert into tree with updated pgno. - * Set #MDB_TXN_ERROR on failure. * @param[in] mc cursor pointing to the page to be touched * @return 0 on success, non-zero on failure. */ @@ -2428,7 +2602,7 @@ mdb_page_touch(MDB_cursor *mc) (rc = mdb_page_alloc(mc, 1, &np))) goto fail; pgno = np->mp_pgno; - DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), + DPRINTF(("touched db %d page %"Y"u -> %"Y"u", DDBI(mc), mp->mp_pgno, pgno)); mdb_cassert(mc, mp->mp_pgno != pgno); mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); @@ -2491,11 +2665,18 @@ done: if (m2 == mc) continue; if (m2->mc_pg[mc->mc_top] == mp) { m2->mc_pg[mc->mc_top] = np; - if (IS_LEAF(np)) - XCURSOR_REFRESH(m2, mc->mc_top, np); + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + IS_LEAF(np) && + (m2->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + { + MDB_node *leaf = NODEPTR(np, m2->mc_ki[mc->mc_top]); + if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } } } } + MDB_PAGE_UNREF(mc->mc_txn, mp); return 0; fail: @@ -2504,7 +2685,7 @@ fail: } int -mdb_env_sync(MDB_env *env, int force) +mdb_env_sync0(MDB_env *env, int force, pgno_t numpgs) { int rc = 0; if (env->me_flags & MDB_RDONLY) @@ -2513,7 +2694,7 @@ mdb_env_sync(MDB_env *env, int force) if (env->me_flags & MDB_WRITEMAP) { int flags = ((env->me_flags & MDB_MAPASYNC) && !force) ? MS_ASYNC : MS_SYNC; - if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) + if (MDB_MSYNC(env->me_map, env->me_psize * numpgs, flags)) rc = ErrCode(); #ifdef _WIN32 else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) @@ -2533,6 +2714,13 @@ mdb_env_sync(MDB_env *env, int force) return rc; } +int +mdb_env_sync(MDB_env *env, int force) +{ + MDB_meta *m = mdb_env_pick_meta(env); + return mdb_env_sync0(env, force, m->mm_last_pg+1); +} + /** Back up parent txn's cursors, then grab the originals for tracking */ static int mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) @@ -2777,6 +2965,9 @@ mdb_txn_renew0(MDB_txn *txn) /* Moved to here to avoid a data race in read TXNs */ txn->mt_next_pgno = meta->mm_last_pg+1; +#ifdef MDB_VL32 + txn->mt_last_pgno = txn->mt_next_pgno - 1; +#endif txn->mt_flags = flags; @@ -2812,7 +3003,7 @@ mdb_txn_renew(MDB_txn *txn) rc = mdb_txn_renew0(txn); if (rc == MDB_SUCCESS) { - DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + DPRINTF(("renew txn %"Y"u%c %p on mdbenv %p, root page %"Y"u", txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); } @@ -2855,6 +3046,17 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) DPRINTF(("calloc: %s", strerror(errno))); return ENOMEM; } +#ifdef MDB_VL32 + if (!parent) { + txn->mt_rpages = malloc(MDB_TRPAGE_SIZE * sizeof(MDB_ID3)); + if (!txn->mt_rpages) { + free(txn); + return ENOMEM; + } + txn->mt_rpages[0].mid = 0; + txn->mt_rpcheck = MDB_TRPAGE_SIZE/2; + } +#endif txn->mt_dbxs = env->me_dbxs; /* static */ txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; @@ -2882,6 +3084,9 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) parent->mt_child = txn; txn->mt_parent = parent; txn->mt_numdbs = parent->mt_numdbs; +#ifdef MDB_VL32 + txn->mt_rpages = parent->mt_rpages; +#endif memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); /* Copy parent's mt_dbflags, but clear DB_NEW */ for (i=0; imt_numdbs; i++) @@ -2907,12 +3112,16 @@ renew: rc = mdb_txn_renew0(txn); } if (rc) { - if (txn != env->me_txn0) + if (txn != env->me_txn0) { +#ifdef MDB_VL32 + free(txn->mt_rpages); +#endif free(txn); + } } else { txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */ *ret = txn; - DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + DPRINTF(("begin txn %"Y"u%c %p on mdbenv %p, root page %"Y"u", txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); } @@ -2927,7 +3136,7 @@ mdb_txn_env(MDB_txn *txn) return txn->mt_env; } -size_t +mdb_size_t mdb_txn_id(MDB_txn *txn) { if(!txn) return 0; @@ -2979,7 +3188,7 @@ mdb_txn_end(MDB_txn *txn, unsigned mode) /* Export or close DBI handles opened in this txn */ mdb_dbis_update(txn, mode & MDB_END_UPDATE); - DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + DPRINTF(("%s txn %"Y"u%c %p on mdbenv %p, root page %"Y"u", names[mode & MDB_END_OPMASK], txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); @@ -3033,7 +3242,31 @@ mdb_txn_end(MDB_txn *txn, unsigned mode) mdb_midl_free(pghead); } - +#ifdef MDB_VL32 + if (!txn->mt_parent) { + MDB_ID3L el = env->me_rpages, tl = txn->mt_rpages; + unsigned i, x, n = tl[0].mid; + pthread_mutex_lock(&env->me_rpmutex); + for (i = 1; i <= n; i++) { + if (tl[i].mid & (MDB_RPAGE_CHUNK-1)) { + /* tmp overflow pages that we didn't share in env */ + munmap(tl[i].mptr, tl[i].mcnt * env->me_psize); + } else { + x = mdb_mid3l_search(el, tl[i].mid); + if (tl[i].mptr == el[x].mptr) { + el[x].mref--; + } else { + /* another tmp overflow page */ + munmap(tl[i].mptr, tl[i].mcnt * env->me_psize); + } + } + } + pthread_mutex_unlock(&env->me_rpmutex); + tl[0].mid = 0; + if (mode & MDB_END_FREE) + free(tl); + } +#endif if (mode & MDB_END_FREE) free(txn); } @@ -3065,6 +3298,9 @@ mdb_txn_abort(MDB_txn *txn) /** Save the freelist as of this transaction to the freeDB. * This changes the freelist. Keep trying until it stabilizes. + * + * When (MDB_DEVEL) & 2, the changes do not affect #mdb_page_alloc(), + * it then uses the transaction's original snapshot of the freeDB. */ static int mdb_freelist_save(MDB_txn *txn) @@ -3094,41 +3330,10 @@ mdb_freelist_save(MDB_txn *txn) * we may be unable to return them to me_pghead. */ MDB_page *mp = txn->mt_loose_pgs; - MDB_ID2 *dl = txn->mt_u.dirty_list; - unsigned x; if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0) return rc; - for (; mp; mp = NEXT_LOOSE_PAGE(mp)) { + for (; mp; mp = NEXT_LOOSE_PAGE(mp)) mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); - /* must also remove from dirty list */ - if (txn->mt_flags & MDB_TXN_WRITEMAP) { - for (x=1; x<=dl[0].mid; x++) - if (dl[x].mid == mp->mp_pgno) - break; - mdb_tassert(txn, x <= dl[0].mid); - } else { - x = mdb_mid2l_search(dl, mp->mp_pgno); - mdb_tassert(txn, dl[x].mid == mp->mp_pgno); - mdb_dpage_free(env, mp); - } - dl[x].mptr = NULL; - } - { - /* squash freed slots out of the dirty list */ - unsigned y; - for (y=1; dl[y].mptr && y <= dl[0].mid; y++); - if (y <= dl[0].mid) { - for(x=y, y++;;) { - while (!dl[y].mptr && y <= dl[0].mid) y++; - if (y > dl[0].mid) break; - dl[x++] = dl[y++]; - } - dl[0].mid = x-1; - } else { - /* all slots freed */ - dl[0].mid = 0; - } - } txn->mt_loose_pgs = NULL; txn->mt_loose_count = 0; } @@ -3184,10 +3389,10 @@ mdb_freelist_save(MDB_txn *txn) #if (MDB_DEBUG) > 1 { unsigned int i = free_pgs[0]; - DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", + DPRINTF(("IDL write txn %"Y"u root %"Y"u num %u", txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); for (; i; i--) - DPRINTF(("IDL %"Z"u", free_pgs[i])); + DPRINTF(("IDL %"Y"u", free_pgs[i])); } #endif continue; @@ -3298,15 +3503,16 @@ mdb_page_flush(MDB_txn *txn, int keep) MDB_ID2L dl = txn->mt_u.dirty_list; unsigned psize = env->me_psize, j; int i, pagecount = dl[0].mid, rc; - size_t size = 0, pos = 0; + size_t size = 0; + off64_t pos = 0; pgno_t pgno = 0; MDB_page *dp = NULL; #ifdef _WIN32 OVERLAPPED ov; #else struct iovec iov[MDB_COMMIT_PAGES]; - ssize_t wpos = 0, wsize = 0, wres; - size_t next_pos = 1; /* impossible pos, so pos != next_pos */ + ssize_t wsize = 0, wres; + off64_t wpos = 0, next_pos = 1; /* impossible pos, so pos != next_pos */ int n = 0; #endif @@ -3405,7 +3611,7 @@ retry_seek: wpos = pos; wsize = 0; } - DPRINTF(("committing page %"Z"u", pgno)); + DPRINTF(("committing page %"Y"u", pgno)); next_pos = pos + size; iov[n].iov_len = size; iov[n].iov_base = (char *)dp; @@ -3413,6 +3619,10 @@ retry_seek: n++; #endif /* _WIN32 */ } +#ifdef MDB_VL32 + if (pgno > txn->mt_last_pgno) + txn->mt_last_pgno = pgno; +#endif /* MIPS has cache coherency issues, this is a no-op everywhere else * Note: for any size >= on-chip cache size, entire on-chip cache is @@ -3438,6 +3648,8 @@ done: return MDB_SUCCESS; } +static int ESECT mdb_env_share_locks(MDB_env *env, int *excl); + int mdb_txn_commit(MDB_txn *txn) { @@ -3614,7 +3826,7 @@ mdb_txn_commit(MDB_txn *txn) !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) goto done; - DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", + DPRINTF(("committing txn %"Y"u %p on mdbenv %p, root page %"Y"u", txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); /* Update DB root pointers */ @@ -3652,11 +3864,23 @@ mdb_txn_commit(MDB_txn *txn) mdb_audit(txn); #endif - if ((rc = mdb_page_flush(txn, 0)) || - (rc = mdb_env_sync(env, 0)) || - (rc = mdb_env_write_meta(txn))) + if ((rc = mdb_page_flush(txn, 0))) + goto fail; + if (!F_ISSET(txn->mt_flags, MDB_TXN_NOSYNC) && + (rc = mdb_env_sync0(env, 0, txn->mt_next_pgno))) + goto fail; + if ((rc = mdb_env_write_meta(txn))) goto fail; end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; + if (env->me_flags & MDB_PREVSNAPSHOT) { + if (!(env->me_flags & MDB_NOLOCK)) { + int excl; + rc = mdb_env_share_locks(env, &excl); + if (rc) + goto fail; + } + env->me_flags ^= MDB_PREVSNAPSHOT; + } done: mdb_txn_end(txn, end_mode); @@ -3670,11 +3894,12 @@ fail: /** Read the environment parameters of a DB environment before * mapping it into memory. * @param[in] env the environment handle + * @param[in] prev whether to read the backup meta page * @param[out] meta address of where to store the meta information * @return 0 on success, non-zero on failure. */ static int ESECT -mdb_env_read_header(MDB_env *env, MDB_meta *meta) +mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta) { MDB_metabuf pbuf; MDB_page *p; @@ -3709,7 +3934,7 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) p = (MDB_page *)&pbuf; if (!F_ISSET(p->mp_flags, P_META)) { - DPRINTF(("page %"Z"u not a meta page", p->mp_pgno)); + DPRINTF(("page %"Y"u not a meta page", p->mp_pgno)); return MDB_INVALID; } @@ -3725,7 +3950,7 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) return MDB_VERSION_MISMATCH; } - if (off == 0 || m->mm_txnid > meta->mm_txnid) + if (off == 0 || (prev ? m->mm_txnid < meta->mm_txnid : m->mm_txnid > meta->mm_txnid)) *meta = *m; } return 0; @@ -3779,7 +4004,6 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) p = calloc(NUM_METAS, psize); if (!p) return ENOMEM; - p->mp_pgno = 0; p->mp_flags = P_META; *(MDB_meta *)METADATA(p) = *meta; @@ -3810,8 +4034,8 @@ mdb_env_write_meta(MDB_txn *txn) MDB_env *env; MDB_meta meta, metab, *mp; unsigned flags; - size_t mapsize; - off_t off; + mdb_size_t mapsize; + off64_t off; int rc, len, toggle; char *ptr; HANDLE mfd; @@ -3822,11 +4046,11 @@ mdb_env_write_meta(MDB_txn *txn) #endif toggle = txn->mt_txnid & 1; - DPRINTF(("writing meta page %d for root page %"Z"u", + DPRINTF(("writing meta page %d for root page %"Y"u", toggle, txn->mt_dbs[MAIN_DBI].md_root)); env = txn->mt_env; - flags = env->me_flags; + flags = txn->mt_flags | env->me_flags; mp = env->me_metas[toggle]; mapsize = env->me_metas[toggle ^ 1]->mm_mapsize; /* Persist any increases of mapsize config */ @@ -3874,10 +4098,7 @@ mdb_env_write_meta(MDB_txn *txn) len = sizeof(MDB_meta) - off; off += (char *)mp - env->me_map; - /* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC. - * (me_mfd goes to the same file as me_fd, but writing to it - * also syncs to disk. Avoids a separate fdatasync() call.) - */ + /* Write to the SYNC fd */ mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd; #ifdef _WIN32 { @@ -3938,7 +4159,8 @@ static MDB_meta * mdb_env_pick_meta(const MDB_env *env) { MDB_meta *const *metas = env->me_metas; - return metas[ metas[0]->mm_txnid < metas[1]->mm_txnid ]; + return metas[ (metas[0]->mm_txnid < metas[1]->mm_txnid) ^ + ((env->me_flags & MDB_PREVSNAPSHOT) != 0) ]; } int ESECT @@ -3958,6 +4180,9 @@ mdb_env_create(MDB_env **env) #ifdef MDB_USE_POSIX_SEM e->me_rmutex = SEM_FAILED; e->me_wmutex = SEM_FAILED; +#elif defined MDB_USE_SYSV_SEM + e->me_rmutex->semid = -1; + e->me_wmutex->semid = -1; #endif e->me_pid = getpid(); GET_PAGESIZE(e->me_os_psize); @@ -3966,6 +4191,19 @@ mdb_env_create(MDB_env **env) return MDB_SUCCESS; } +#ifdef _WIN32 +/** @brief Map a result from an NTAPI call to WIN32. */ +static DWORD +mdb_nt2win32(NTSTATUS st) +{ + OVERLAPPED o = {0}; + DWORD br; + o.Internal = st; + GetOverlappedResult(NULL, &o, &br, FALSE); + return GetLastError(); +} +#endif + static int ESECT mdb_env_map(MDB_env *env, void *addr) { @@ -3973,42 +4211,51 @@ mdb_env_map(MDB_env *env, void *addr) unsigned int flags = env->me_flags; #ifdef _WIN32 int rc; + int access = SECTION_MAP_READ; HANDLE mh; - LONG sizelo, sizehi; - size_t msize; + void *map; + SIZE_T msize; + ULONG pageprot = PAGE_READONLY, secprot, alloctype; + if (flags & MDB_WRITEMAP) { + access |= SECTION_MAP_WRITE; + pageprot = PAGE_READWRITE; + } if (flags & MDB_RDONLY) { - /* Don't set explicit map size, use whatever exists */ + secprot = PAGE_READONLY; msize = 0; - sizelo = 0; - sizehi = 0; + alloctype = 0; } else { + secprot = PAGE_READWRITE; msize = env->me_mapsize; - sizelo = msize & 0xffffffff; - sizehi = msize >> 16 >> 16; /* only needed on Win64 */ - - /* Windows won't create mappings for zero length files. - * and won't map more than the file size. - * Just set the maxsize right now. - */ - if (!(flags & MDB_WRITEMAP) && (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo - || !SetEndOfFile(env->me_fd) - || SetFilePointer(env->me_fd, 0, NULL, 0) != 0)) - return ErrCode(); + alloctype = MEM_RESERVE; } - mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ? - PAGE_READWRITE : PAGE_READONLY, - sizehi, sizelo, NULL); - if (!mh) - return ErrCode(); - env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? - FILE_MAP_WRITE : FILE_MAP_READ, - 0, 0, msize, addr); - rc = env->me_map ? 0 : ErrCode(); - CloseHandle(mh); + rc = NtCreateSection(&mh, access, NULL, NULL, secprot, SEC_RESERVE, env->me_fd); if (rc) - return rc; + return mdb_nt2win32(rc); + map = addr; +#ifdef MDB_VL32 + msize = NUM_METAS * env->me_psize; +#endif + rc = NtMapViewOfSection(mh, GetCurrentProcess(), &map, 0, 0, NULL, &msize, ViewUnmap, alloctype, pageprot); +#ifdef MDB_VL32 + env->me_fmh = mh; +#else + NtClose(mh); +#endif + if (rc) + return mdb_nt2win32(rc); + env->me_map = map; +#else +#ifdef MDB_VL32 + (void) flags; + env->me_map = mmap(addr, NUM_METAS * env->me_psize, PROT_READ, MAP_SHARED, + env->me_fd, 0); + if (env->me_map == MAP_FAILED) { + env->me_map = NULL; + return ErrCode(); + } #else int prot = PROT_READ; if (flags & MDB_WRITEMAP) { @@ -4042,6 +4289,7 @@ mdb_env_map(MDB_env *env, void *addr) */ if (addr && env->me_map != addr) return EBUSY; /* TODO: Make a new MDB_* error code? */ +#endif p = (MDB_page *)env->me_map; env->me_metas[0] = METADATA(p); @@ -4051,15 +4299,17 @@ mdb_env_map(MDB_env *env, void *addr) } int ESECT -mdb_env_set_mapsize(MDB_env *env, size_t size) +mdb_env_set_mapsize(MDB_env *env, mdb_size_t size) { /* If env is already open, caller is responsible for making * sure there are no active txns. */ if (env->me_map) { - int rc; MDB_meta *meta; +#ifndef MDB_VL32 void *old; + int rc; +#endif if (env->me_txn) return EINVAL; meta = mdb_env_pick_meta(env); @@ -4067,16 +4317,21 @@ mdb_env_set_mapsize(MDB_env *env, size_t size) size = meta->mm_mapsize; { /* Silently round up to minimum if the size is too small */ - size_t minsize = (meta->mm_last_pg + 1) * env->me_psize; + mdb_size_t minsize = (meta->mm_last_pg + 1) * env->me_psize; if (size < minsize) size = minsize; } +#ifndef MDB_VL32 + /* For MDB_VL32 this bit is a noop since we dynamically remap + * chunks of the DB anyway. + */ munmap(env->me_map, env->me_mapsize); env->me_mapsize = size; old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; rc = mdb_env_map(env, old); if (rc) return rc; +#endif /* !MDB_VL32 */ } env->me_mapsize = size; if (env->me_psize) @@ -4112,7 +4367,7 @@ mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) } static int ESECT -mdb_fsize(HANDLE fd, size_t *size) +mdb_fsize(HANDLE fd, mdb_size_t *size) { #ifdef _WIN32 LARGE_INTEGER fsize; @@ -4132,189 +4387,6 @@ mdb_fsize(HANDLE fd, size_t *size) return MDB_SUCCESS; } - -#ifdef _WIN32 -typedef wchar_t mdb_nchar_t; -# define MDB_NAME(str) L##str -# define mdb_name_cpy wcscpy -#else -/** Character type for file names: char on Unix, wchar_t on Windows */ -typedef char mdb_nchar_t; -# define MDB_NAME(str) str /**< #mdb_nchar_t[] string literal */ -# define mdb_name_cpy strcpy /**< Copy name (#mdb_nchar_t string) */ -#endif - -/** Filename - string of #mdb_nchar_t[] */ -typedef struct MDB_name { - int mn_len; /**< Length */ - int mn_alloced; /**< True if #mn_val was malloced */ - mdb_nchar_t *mn_val; /**< Contents */ -} MDB_name; - -/** Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR] */ -static const mdb_nchar_t *const mdb_suffixes[2][2] = { - { MDB_NAME("/data.mdb"), MDB_NAME("") }, - { MDB_NAME("/lock.mdb"), MDB_NAME("-lock") } -}; - -#define MDB_SUFFLEN 9 /**< Max string length in #mdb_suffixes[] */ - -/** Set up filename + scratch area for filename suffix, for opening files. - * It should be freed with #mdb_fname_destroy(). - * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16. - * - * @param[in] path Pathname for #mdb_env_open(). - * @param[in] envflags Whether a subdir and/or lockfile will be used. - * @param[out] fname Resulting filename, with room for a suffix if necessary. - */ -static int ESECT -mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) -{ - int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK); - fname->mn_alloced = 0; -#ifdef _WIN32 - return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN); -#else - fname->mn_len = strlen(path); - if (no_suffix) - fname->mn_val = (char *) path; - else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) { - fname->mn_alloced = 1; - strcpy(fname->mn_val, path); - } - else - return ENOMEM; - return MDB_SUCCESS; -#endif -} - -/** Destroy \b fname from #mdb_fname_init() */ -#define mdb_fname_destroy(fname) \ - do { if ((fname).mn_alloced) free((fname).mn_val); } while (0) - -#ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */ -# define MDB_CLOEXEC O_CLOEXEC -#else -# define MDB_CLOEXEC 0 -#endif - -/** File type, access mode etc. for #mdb_fopen() */ -enum mdb_fopen_type { -#ifdef _WIN32 - MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS -#else - /* A comment in mdb_fopen() explains some O_* flag choices. */ - MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ - MDB_O_RDWR = O_RDWR |O_CREAT, /**< for me_fd */ - MDB_O_META = O_WRONLY|MDB_DSYNC |MDB_CLOEXEC, /**< for me_mfd */ - MDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */ - /** Bitmask for open() flags in enum #mdb_fopen_type. The other bits - * distinguish otherwise-equal MDB_O_* constants from each other. - */ - MDB_O_MASK = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY, - MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */ -#endif -}; - -/** Open an LMDB file. - * @param[in] env The LMDB environment. - * @param[in,out] fname Path from from #mdb_fname_init(). A suffix is - * appended if necessary to create the filename, without changing mn_len. - * @param[in] which Determines file type, access mode, etc. - * @param[in] mode The Unix permissions for the file, if we create it. - * @param[out] res Resulting file handle. - * @return 0 on success, non-zero on failure. - */ -static int ESECT -mdb_fopen(const MDB_env *env, MDB_name *fname, - enum mdb_fopen_type which, mdb_mode_t mode, - HANDLE *res) -{ - int rc = MDB_SUCCESS; - HANDLE fd; -#ifdef _WIN32 - DWORD acc, share, disp, attrs; -#else - int flags; -#endif - - if (fname->mn_alloced) /* modifiable copy */ - mdb_name_cpy(fname->mn_val + fname->mn_len, - mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]); - - /* The directory must already exist. Usually the file need not. - * MDB_O_META requires the file because we already created it using - * MDB_O_RDWR. MDB_O_COPY must not overwrite an existing file. - * - * With MDB_O_COPY we do not want the OS to cache the writes, since - * the source data is already in the OS cache. - * - * The lockfile needs FD_CLOEXEC (close file descriptor on exec*()) - * to avoid the flock() issues noted under Caveats in lmdb.h. - * Also set it for other filehandles which the user cannot get at - * and close himself, which he may need after fork(). I.e. all but - * me_fd, which programs do use via mdb_env_get_fd(). - */ - -#ifdef _WIN32 - acc = GENERIC_READ|GENERIC_WRITE; - share = FILE_SHARE_READ|FILE_SHARE_WRITE; - disp = OPEN_ALWAYS; - attrs = FILE_ATTRIBUTE_NORMAL; - switch (which) { - case MDB_O_RDONLY: /* read-only datafile */ - acc = GENERIC_READ; - disp = OPEN_EXISTING; - break; - case MDB_O_META: /* for writing metapages */ - acc = GENERIC_WRITE; - disp = OPEN_EXISTING; - attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH; - break; - case MDB_O_COPY: /* mdb_env_copy() & co */ - acc = GENERIC_WRITE; - share = 0; - disp = CREATE_NEW; - attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH; - break; - default: break; /* silence gcc -Wswitch (not all enum values handled) */ - } - fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL); -#else - fd = open(fname->mn_val, which & MDB_O_MASK, mode); -#endif - - if (fd == INVALID_HANDLE_VALUE) - rc = ErrCode(); -#ifndef _WIN32 - else { - if (which != MDB_O_RDONLY && which != MDB_O_RDWR) { - /* Set CLOEXEC if we could not pass it to open() */ - if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1) - (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); - } - if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) { - /* This may require buffer alignment. There is no portable - * way to ask how much, so we require OS pagesize alignment. - */ -# ifdef F_NOCACHE /* __APPLE__ */ - (void) fcntl(fd, F_NOCACHE, 1); -# elif defined O_DIRECT - /* open(...O_DIRECT...) would break on filesystems without - * O_DIRECT support (ITS#7682). Try to set it here instead. - */ - if ((flags = fcntl(fd, F_GETFL)) != -1) - (void) fcntl(fd, F_SETFL, flags | O_DIRECT); -# endif - } - } -#endif /* !_WIN32 */ - - *res = fd; - return rc; -} - - #ifdef BROKEN_FDATASYNC #include #include @@ -4323,7 +4395,7 @@ mdb_fopen(const MDB_env *env, MDB_name *fname, /** Further setup required for opening an LMDB environment */ static int ESECT -mdb_env_open2(MDB_env *env) +mdb_env_open2(MDB_env *env, int prev) { unsigned int flags = env->me_flags; int i, newenv = 0, rc; @@ -4386,7 +4458,7 @@ mdb_env_open2(MDB_env *env) } #endif - if ((i = mdb_env_read_header(env, &meta)) != 0) { + if ((i = mdb_env_read_header(env, prev, &meta)) != 0) { if (i != ENOENT) return i; DPUTS("new mdbenv"); @@ -4409,7 +4481,7 @@ mdb_env_open2(MDB_env *env) /* Make sure mapsize >= committed data size. Even when using * mm_mapsize, which could be broken in old files (ITS#7789). */ - size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; + mdb_size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; if (env->me_mapsize < minsize) env->me_mapsize = minsize; } @@ -4428,6 +4500,18 @@ mdb_env_open2(MDB_env *env) return rc; newenv = 0; } +#ifdef _WIN32 + /* For FIXEDMAP, make sure the file is non-empty before we attempt to map it */ + if (newenv) { + char dummy = 0; + DWORD len; + rc = WriteFile(env->me_fd, &dummy, 1, &len, NULL); + if (!rc) { + rc = ErrCode(); + return rc; + } + } +#endif rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL); if (rc) @@ -4450,6 +4534,9 @@ mdb_env_open2(MDB_env *env) #endif env->me_maxpg = env->me_mapsize / env->me_psize; + if (env->me_txns) + env->me_txns->mti_txnid = meta.mm_txnid; + #if MDB_DEBUG { MDB_meta *meta = mdb_env_pick_meta(env); @@ -4459,11 +4546,11 @@ mdb_env_open2(MDB_env *env) meta->mm_version, env->me_psize)); DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1))); DPRINTF(("depth: %u", db->md_depth)); - DPRINTF(("entries: %"Z"u", db->md_entries)); - DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); - DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); - DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); - DPRINTF(("root: %"Z"u", db->md_root)); + DPRINTF(("entries: %"Y"u", db->md_entries)); + DPRINTF(("branch pages: %"Y"u", db->md_branch_pages)); + DPRINTF(("leaf pages: %"Y"u", db->md_leaf_pages)); + DPRINTF(("overflow pages: %"Y"u", db->md_overflow_pages)); + DPRINTF(("root: %"Y"u", db->md_root)); } #endif @@ -4480,11 +4567,7 @@ mdb_env_reader_dest(void *ptr) { MDB_reader *reader = ptr; -#ifndef _WIN32 - if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */ -#endif - /* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */ - reader->mr_pid = 0; + reader->mr_pid = 0; } #ifdef _WIN32 @@ -4549,9 +4632,6 @@ static int ESECT mdb_env_share_locks(MDB_env *env, int *excl) { int rc = 0; - MDB_meta *meta = mdb_env_pick_meta(env); - - env->me_txns->mti_txnid = meta->mm_txnid; #ifdef _WIN32 { @@ -4724,30 +4804,56 @@ mdb_hash_enc(MDB_val *val, char *encbuf) /** Open and/or initialize the lock region for the environment. * @param[in] env The LMDB environment. - * @param[in] fname Filename + scratch area, from #mdb_fname_init(). + * @param[in] lpath The pathname of the file used for the lock region. * @param[in] mode The Unix permissions for the file, if we create it. * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive * @return 0 on success, non-zero on failure. */ static int ESECT -mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) +mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) { #ifdef _WIN32 # define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT #else # define MDB_ERRCODE_ROFS EROFS +#ifdef O_CLOEXEC /* Linux: Open file and set FD_CLOEXEC atomically */ +# define MDB_CLOEXEC O_CLOEXEC +#else + int fdflags; +# define MDB_CLOEXEC 0 +#endif +#endif +#ifdef MDB_USE_SYSV_SEM + int semid; + union semun semu; #endif int rc; - off_t size, rsize; + off64_t size, rsize; - rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd); - if (rc) { - /* Omit lockfile if read-only env on read-only filesystem */ +#ifdef _WIN32 + wchar_t *wlpath; + rc = utf8_to_utf16(lpath, -1, &wlpath, NULL); + if (rc) + return rc; + env->me_lfd = CreateFileW(wlpath, GENERIC_READ|GENERIC_WRITE, + FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, + FILE_ATTRIBUTE_NORMAL, NULL); + free(wlpath); +#else + env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode); +#endif + if (env->me_lfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { return MDB_SUCCESS; } - goto fail; + goto fail_errno; } +#if ! ((MDB_CLOEXEC) || defined(_WIN32)) + /* Lose record locks when exec*() */ + if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) + fcntl(env->me_lfd, F_SETFD, fdflags); +#endif if (!(env->me_flags & MDB_NOTLS)) { rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); @@ -4872,29 +4978,31 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) env->me_wmutex = sem_open(env->me_txns->mti_wmname, O_CREAT|O_EXCL, mode, 1); if (env->me_wmutex == SEM_FAILED) goto fail_errno; +#elif defined(MDB_USE_SYSV_SEM) + unsigned short vals[2] = {1, 1}; + key_t key = ftok(lpath, 'M'); + if (key == -1) + goto fail_errno; + semid = semget(key, 2, (mode & 0777) | IPC_CREAT); + if (semid < 0) + goto fail_errno; + semu.array = vals; + if (semctl(semid, 0, SETALL, semu) < 0) + goto fail_errno; + env->me_txns->mti_semid = semid; #else /* MDB_USE_POSIX_MUTEX: */ pthread_mutexattr_t mattr; - /* Solaris needs this before initing a robust mutex. Otherwise - * it may skip the init and return EBUSY "seems someone already - * inited" or EINVAL "it was inited differently". - */ - memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex)); - memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex)); - - if ((rc = pthread_mutexattr_init(&mattr))) - goto fail; - - rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); + if ((rc = pthread_mutexattr_init(&mattr)) + || (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED)) #ifdef MDB_ROBUST_SUPPORTED - if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); + || (rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST)) #endif - if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr); - if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr); - pthread_mutexattr_destroy(&mattr); - if (rc) + || (rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr)) + || (rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr))) goto fail; -#endif /* _WIN32 || MDB_USE_POSIX_SEM */ + pthread_mutexattr_destroy(&mattr); +#endif /* _WIN32 || ... */ env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_format = MDB_LOCK_FORMAT; @@ -4902,6 +5010,9 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) env->me_txns->mti_numreaders = 0; } else { +#ifdef MDB_USE_SYSV_SEM + struct semid_ds buf; +#endif if (env->me_txns->mti_magic != MDB_MAGIC) { DPUTS("lock region has invalid magic"); rc = MDB_INVALID; @@ -4927,8 +5038,33 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) if (env->me_rmutex == SEM_FAILED) goto fail_errno; env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0); if (env->me_wmutex == SEM_FAILED) goto fail_errno; +#elif defined(MDB_USE_SYSV_SEM) + semid = env->me_txns->mti_semid; + semu.buf = &buf; + /* check for read access */ + if (semctl(semid, 0, IPC_STAT, semu) < 0) + goto fail_errno; + /* check for write access */ + if (semctl(semid, 0, IPC_SET, semu) < 0) + goto fail_errno; #endif } +#ifdef MDB_USE_SYSV_SEM + env->me_rmutex->semid = semid; + env->me_wmutex->semid = semid; + env->me_rmutex->semnum = 0; + env->me_wmutex->semnum = 1; + env->me_rmutex->locked = &env->me_txns->mti_rlocked; + env->me_wmutex->locked = &env->me_txns->mti_wlocked; +#endif +#ifdef MDB_VL32 +#ifdef _WIN32 + env->me_rpmutex = CreateMutex(NULL, FALSE, NULL); +#else + pthread_mutex_init(&env->me_rpmutex, NULL); +#endif +#endif + return MDB_SUCCESS; fail_errno: @@ -4937,13 +5073,19 @@ fail: return rc; } + /** The name of the lock file in the DB environment */ +#define LOCKNAME "/lock.mdb" + /** The name of the data file in the DB environment */ +#define DATANAME "/data.mdb" + /** The suffix of the lock file when no subdir is used */ +#define LOCKSUFF "-lock" /** Only a subset of the @ref mdb_env flags can be changed * at runtime. Changing other flags requires closing the * environment and re-opening it with the new flags. */ #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ - MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) + MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD|MDB_PREVSNAPSHOT) #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) # error "Persistent DB flags & env flags overlap, but both go in mm_flags" @@ -4952,18 +5094,47 @@ fail: int ESECT mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) { - int rc, excl = -1; - MDB_name fname; + int oflags, rc, len, excl = -1; + char *lpath, *dpath; +#ifdef _WIN32 + wchar_t *wpath; +#endif if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) return EINVAL; +#ifdef MDB_VL32 + if (flags & MDB_WRITEMAP) { + /* silently ignore WRITEMAP in 32 bit mode */ + flags ^= MDB_WRITEMAP; + } + if (flags & MDB_FIXEDMAP) { + /* cannot support FIXEDMAP */ + return EINVAL; + } +#endif + + len = strlen(path); + if (flags & MDB_NOSUBDIR) { + rc = len + sizeof(LOCKSUFF) + len + 1; + } else { + rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); + } + lpath = malloc(rc); + if (!lpath) + return ENOMEM; + if (flags & MDB_NOSUBDIR) { + dpath = lpath + len + sizeof(LOCKSUFF); + sprintf(lpath, "%s" LOCKSUFF, path); + strcpy(dpath, path); + } else { + dpath = lpath + len + sizeof(LOCKNAME); + sprintf(lpath, "%s" LOCKNAME, path); + sprintf(dpath, "%s" DATANAME, path); + } + + rc = MDB_SUCCESS; flags |= env->me_flags; - - rc = mdb_fname_init(path, flags, &fname); - if (rc) - return rc; - if (flags & MDB_RDONLY) { /* silently ignore WRITEMAP when we're only getting read access */ flags &= ~MDB_WRITEMAP; @@ -4972,6 +5143,17 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) rc = ENOMEM; } +#ifdef MDB_VL32 + if (!rc) { + env->me_rpages = malloc(MDB_ERPAGE_SIZE * sizeof(MDB_ID3)); + if (!env->me_rpages) { + rc = ENOMEM; + goto leave; + } + env->me_rpages[0].mid = 0; + env->me_rpcheck = MDB_ERPAGE_SIZE/2; + } +#endif env->me_flags = flags |= MDB_ENV_ACTIVE; if (rc) goto leave; @@ -4988,34 +5170,76 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode /* For RDONLY, get lockfile after we know datafile exists */ if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { - rc = mdb_env_setup_locks(env, &fname, mode, &excl); + rc = mdb_env_setup_locks(env, lpath, mode, &excl); if (rc) goto leave; } - rc = mdb_fopen(env, &fname, - (flags & MDB_RDONLY) ? MDB_O_RDONLY : MDB_O_RDWR, - mode, &env->me_fd); +#ifdef _WIN32 + if (F_ISSET(flags, MDB_RDONLY)) { + oflags = GENERIC_READ; + len = OPEN_EXISTING; + } else { + oflags = GENERIC_READ|GENERIC_WRITE; + len = OPEN_ALWAYS; + } + mode = FILE_ATTRIBUTE_NORMAL; + rc = utf8_to_utf16(dpath, -1, &wpath, NULL); if (rc) goto leave; + env->me_fd = CreateFileW(wpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, + NULL, len, mode, NULL); + free(wpath); +#else + if (F_ISSET(flags, MDB_RDONLY)) + oflags = O_RDONLY; + else + oflags = O_RDWR | O_CREAT; - if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { - rc = mdb_env_setup_locks(env, &fname, mode, &excl); - if (rc) - goto leave; + env->me_fd = open(dpath, oflags, mode); +#endif + if (env->me_fd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; } - if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { - if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { + if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { + rc = mdb_env_setup_locks(env, lpath, mode, &excl); + if (rc) + goto leave; + if ((flags & MDB_PREVSNAPSHOT) && !excl) { + rc = EAGAIN; + goto leave; + } + } + + if ((rc = mdb_env_open2(env, flags & MDB_PREVSNAPSHOT)) == MDB_SUCCESS) { + if (flags & (MDB_RDONLY|MDB_WRITEMAP)) { + env->me_mfd = env->me_fd; + } else { /* Synchronous fd for meta writes. Needed even with * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. */ - rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); +#ifdef _WIN32 + len = OPEN_EXISTING; + rc = utf8_to_utf16(dpath, -1, &wpath, NULL); if (rc) goto leave; + env->me_mfd = CreateFileW(wpath, oflags, + FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, + mode | FILE_FLAG_WRITE_THROUGH, NULL); + free(wpath); +#else + oflags &= ~O_CREAT; + env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode); +#endif + if (env->me_mfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } } DPRINTF(("opened dbenv %p", (void *) env)); - if (excl > 0) { + if (excl > 0 && !(flags & MDB_PREVSNAPSHOT)) { rc = mdb_env_share_locks(env, &excl); if (rc) goto leave; @@ -5032,6 +5256,16 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); txn->mt_env = env; +#ifdef MDB_VL32 + txn->mt_rpages = malloc(MDB_TRPAGE_SIZE * sizeof(MDB_ID3)); + if (!txn->mt_rpages) { + free(txn); + rc = ENOMEM; + goto leave; + } + txn->mt_rpages[0].mid = 0; + txn->mt_rpcheck = MDB_TRPAGE_SIZE/2; +#endif txn->mt_dbxs = env->me_dbxs; txn->mt_flags = MDB_TXN_FINISHED; env->me_txn0 = txn; @@ -5045,7 +5279,7 @@ leave: if (rc) { mdb_env_close0(env, excl); } - mdb_fname_destroy(fname); + free(lpath); return rc; } @@ -5070,6 +5304,15 @@ mdb_env_close0(MDB_env *env, int excl) free(env->me_dbflags); free(env->me_path); free(env->me_dirty_list); +#ifdef MDB_VL32 + if (env->me_txn0 && env->me_txn0->mt_rpages) + free(env->me_txn0->mt_rpages); + { unsigned int x; + for (x=1; x<=env->me_rpages[0].mid; x++) + munmap(env->me_rpages[x].mptr, env->me_rpages[x].mcnt * env->me_psize); + } + free(env->me_rpages); +#endif free(env->me_txn0); mdb_midl_free(env->me_free_pgs); @@ -5087,14 +5330,18 @@ mdb_env_close0(MDB_env *env, int excl) } if (env->me_map) { +#ifdef MDB_VL32 + munmap(env->me_map, NUM_METAS*env->me_psize); +#else munmap(env->me_map, env->me_mapsize); +#endif } - if (env->me_mfd != INVALID_HANDLE_VALUE) + if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE) (void) close(env->me_mfd); if (env->me_fd != INVALID_HANDLE_VALUE) (void) close(env->me_fd); if (env->me_txns) { - MDB_PID_T pid = getpid(); + MDB_PID_T pid = env->me_pid; /* Clearing readers is done in this function because * me_txkey with its destructor must be disabled first. * @@ -5128,6 +5375,16 @@ mdb_env_close0(MDB_env *env, int excl) sem_unlink(env->me_txns->mti_wmname); } } +#elif defined(MDB_USE_SYSV_SEM) + if (env->me_rmutex->semid != -1) { + /* If we have the filelock: If we are the + * only remaining user, clean up semaphores. + */ + if (excl == 0) + mdb_env_excl_lock(env, &excl); + if (excl > 0) + semctl(env->me_rmutex->semid, 0, IPC_RMID); + } #endif munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); } @@ -5142,6 +5399,14 @@ mdb_env_close0(MDB_env *env, int excl) #endif (void) close(env->me_lfd); } +#ifdef MDB_VL32 +#ifdef _WIN32 + if (env->me_fmh) CloseHandle(env->me_fmh); + if (env->me_rpmutex) CloseHandle(env->me_rpmutex); +#else + pthread_mutex_destroy(&env->me_rpmutex); +#endif +#endif env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); } @@ -5165,18 +5430,18 @@ mdb_env_close(MDB_env *env) free(env); } -/** Compare two items pointing at aligned size_t's */ +/** Compare two items pointing at aligned mdb_size_t's */ static int mdb_cmp_long(const MDB_val *a, const MDB_val *b) { - return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 : - *(size_t *)a->mv_data > *(size_t *)b->mv_data; + return (*(mdb_size_t *)a->mv_data < *(mdb_size_t *)b->mv_data) ? -1 : + *(mdb_size_t *)a->mv_data > *(mdb_size_t *)b->mv_data; } /** Compare two items pointing at aligned unsigned int's. * * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp, - * but #mdb_cmp_clong() is called instead if the data type is size_t. + * but #mdb_cmp_clong() is called instead if the data type is mdb_size_t. */ static int mdb_cmp_int(const MDB_val *a, const MDB_val *b) @@ -5281,7 +5546,7 @@ mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) nkeys = NUMKEYS(mp); - DPRINTF(("searching %u keys in %s %spage %"Z"u", + DPRINTF(("searching %u keys in %s %spage %"Y"u", nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mdb_dbg_pgno(mp))); @@ -5293,7 +5558,7 @@ mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) * alignment is guaranteed. Use faster mdb_cmp_int. */ if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { - if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) + if (NODEPTR(mp, 1)->mn_ksize == sizeof(mdb_size_t)) cmp = mdb_cmp_long; else cmp = mdb_cmp_int; @@ -5329,7 +5594,7 @@ mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) DPRINTF(("found leaf index %u [%s], rc = %i", i, DKEY(&nodekey), rc)); else - DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i", + DPRINTF(("found branch index %u [%s -> %"Y"u], rc = %i", i, DKEY(&nodekey), NODEPGNO(node), rc)); #endif if (rc == 0) @@ -5377,7 +5642,7 @@ static void mdb_cursor_pop(MDB_cursor *mc) { if (mc->mc_snum) { - DPRINTF(("popping page %"Z"u off db %d cursor %p", + DPRINTF(("popping page %"Y"u off db %d cursor %p", mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc)); mc->mc_snum--; @@ -5389,13 +5654,11 @@ mdb_cursor_pop(MDB_cursor *mc) } } -/** Push a page onto the top of the cursor's stack. - * Set #MDB_TXN_ERROR on failure. - */ +/** Push a page onto the top of the cursor's stack. */ static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) { - DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, + DPRINTF(("pushing page %"Y"u on db %d cursor %p", mp->mp_pgno, DDBI(mc), (void *) mc)); if (mc->mc_snum >= CURSOR_STACK) { @@ -5410,8 +5673,295 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) return MDB_SUCCESS; } +#ifdef MDB_VL32 +/** Map a read-only page. + * There are two levels of tracking in use, a per-txn list and a per-env list. + * ref'ing and unref'ing the per-txn list is faster since it requires no + * locking. Pages are cached in the per-env list for global reuse, and a lock + * is required. Pages are not immediately unmapped when their refcnt goes to + * zero; they hang around in case they will be reused again soon. + * + * When the per-txn list gets full, all pages with refcnt=0 are purged from the + * list and their refcnts in the per-env list are decremented. + * + * When the per-env list gets full, all pages with refcnt=0 are purged from the + * list and their pages are unmapped. + * + * @note "full" means the list has reached its respective rpcheck threshold. + * This threshold slowly raises if no pages could be purged on a given check, + * and returns to its original value when enough pages were purged. + * + * If purging doesn't free any slots, filling the per-txn list will return + * MDB_TXN_FULL, and filling the per-env list returns MDB_MAP_FULL. + * + * Reference tracking in a txn is imperfect, pages can linger with non-zero + * refcnt even without active references. It was deemed to be too invasive + * to add unrefs in every required location. However, all pages are unref'd + * at the end of the transaction. This guarantees that no stale references + * linger in the per-env list. + * + * Usually we map chunks of 16 pages at a time, but if an overflow page begins + * at the tail of the chunk we extend the chunk to include the entire overflow + * page. Unfortunately, pages can be turned into overflow pages after their + * chunk was already mapped. In that case we must remap the chunk if the + * overflow page is referenced. If the chunk's refcnt is 0 we can just remap + * it, otherwise we temporarily map a new chunk just for the overflow page. + * + * @note this chunk handling means we cannot guarantee that a data item + * returned from the DB will stay alive for the duration of the transaction: + * We unref pages as soon as a cursor moves away from the page + * A subsequent op may cause a purge, which may unmap any unref'd chunks + * The caller must copy the data if it must be used later in the same txn. + * + * Also - our reference counting revolves around cursors, but overflow pages + * aren't pointed to by a cursor's page stack. We have to remember them + * explicitly, in the added mc_ovpg field. A single cursor can only hold a + * reference to one overflow page at a time. + * + * @param[in] txn the transaction for this access. + * @param[in] pgno the page number for the page to retrieve. + * @param[out] ret address of a pointer where the page's address will be stored. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) +{ + MDB_env *env = txn->mt_env; + MDB_page *p; + MDB_ID3L tl = txn->mt_rpages; + MDB_ID3L el = env->me_rpages; + MDB_ID3 id3; + unsigned x, rem; + pgno_t pgno; + int rc, retries = 1; +#ifdef _WIN32 + LARGE_INTEGER off; + SIZE_T len; +#define SET_OFF(off,val) off.QuadPart = val +#define MAP(rc,env,addr,len,off) \ + addr = NULL; \ + rc = NtMapViewOfSection(env->me_fmh, GetCurrentProcess(), &addr, 0, \ + len, &off, &len, ViewUnmap, (env->me_flags & MDB_RDONLY) ? 0 : MEM_RESERVE, PAGE_READONLY); \ + if (rc) rc = mdb_nt2win32(rc) +#else + off64_t off; + size_t len; +#define SET_OFF(off,val) off = val +#define MAP(rc,env,addr,len,off) \ + addr = mmap(NULL, len, PROT_READ, MAP_SHARED, env->me_fd, off); \ + rc = (addr == MAP_FAILED) ? errno : 0 +#endif + + /* remember the offset of the actual page number, so we can + * return the correct pointer at the end. + */ + rem = pg0 & (MDB_RPAGE_CHUNK-1); + pgno = pg0 ^ rem; + + id3.mid = 0; + x = mdb_mid3l_search(tl, pgno); + if (x <= tl[0].mid && tl[x].mid == pgno) { + if (x != tl[0].mid && tl[x+1].mid == pg0) + x++; + /* check for overflow size */ + p = (MDB_page *)((char *)tl[x].mptr + rem * env->me_psize); + if (IS_OVERFLOW(p) && p->mp_pages + rem > tl[x].mcnt) { + id3.mcnt = p->mp_pages + rem; + len = id3.mcnt * env->me_psize; + SET_OFF(off, pgno * env->me_psize); + MAP(rc, env, id3.mptr, len, off); + if (rc) + return rc; + /* check for local-only page */ + if (rem) { + mdb_tassert(txn, tl[x].mid != pg0); + /* hope there's room to insert this locally. + * setting mid here tells later code to just insert + * this id3 instead of searching for a match. + */ + id3.mid = pg0; + goto notlocal; + } else { + /* ignore the mapping we got from env, use new one */ + tl[x].mptr = id3.mptr; + tl[x].mcnt = id3.mcnt; + /* if no active ref, see if we can replace in env */ + if (!tl[x].mref) { + unsigned i; + pthread_mutex_lock(&env->me_rpmutex); + i = mdb_mid3l_search(el, tl[x].mid); + if (el[i].mref == 1) { + /* just us, replace it */ + munmap(el[i].mptr, el[i].mcnt * env->me_psize); + el[i].mptr = tl[x].mptr; + el[i].mcnt = tl[x].mcnt; + } else { + /* there are others, remove ourself */ + el[i].mref--; + } + pthread_mutex_unlock(&env->me_rpmutex); + } + } + } + id3.mptr = tl[x].mptr; + id3.mcnt = tl[x].mcnt; + tl[x].mref++; + goto ok; + } + +notlocal: + if (tl[0].mid >= MDB_TRPAGE_MAX - txn->mt_rpcheck) { + unsigned i, y; + /* purge unref'd pages from our list and unref in env */ + pthread_mutex_lock(&env->me_rpmutex); +retry: + y = 0; + for (i=1; i<=tl[0].mid; i++) { + if (!tl[i].mref) { + if (!y) y = i; + /* tmp overflow pages don't go to env */ + if (tl[i].mid & (MDB_RPAGE_CHUNK-1)) { + munmap(tl[i].mptr, tl[i].mcnt * env->me_psize); + continue; + } + x = mdb_mid3l_search(el, tl[i].mid); + el[x].mref--; + } + } + pthread_mutex_unlock(&env->me_rpmutex); + if (!y) { + /* we didn't find any unref'd chunks. + * if we're out of room, fail. + */ + if (tl[0].mid >= MDB_TRPAGE_MAX) + return MDB_TXN_FULL; + /* otherwise, raise threshold for next time around + * and let this go. + */ + txn->mt_rpcheck /= 2; + } else { + /* we found some unused; consolidate the list */ + for (i=y+1; i<= tl[0].mid; i++) + if (tl[i].mref) + tl[y++] = tl[i]; + tl[0].mid = y-1; + /* decrease the check threshold toward its original value */ + if (!txn->mt_rpcheck) + txn->mt_rpcheck = 1; + while (txn->mt_rpcheck < tl[0].mid && txn->mt_rpcheck < MDB_TRPAGE_SIZE/2) + txn->mt_rpcheck *= 2; + } + } + if (tl[0].mid < MDB_TRPAGE_SIZE) { + id3.mref = 1; + if (id3.mid) + goto found; + /* don't map past last written page in read-only envs */ + if ((env->me_flags & MDB_RDONLY) && pgno + MDB_RPAGE_CHUNK-1 > txn->mt_last_pgno) + id3.mcnt = txn->mt_last_pgno + 1 - pgno; + else + id3.mcnt = MDB_RPAGE_CHUNK; + len = id3.mcnt * env->me_psize; + id3.mid = pgno; + + /* search for page in env */ + pthread_mutex_lock(&env->me_rpmutex); + x = mdb_mid3l_search(el, pgno); + if (x <= el[0].mid && el[x].mid == pgno) { + id3.mptr = el[x].mptr; + id3.mcnt = el[x].mcnt; + /* check for overflow size */ + p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize); + if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) { + id3.mcnt = p->mp_pages + rem; + len = id3.mcnt * env->me_psize; + SET_OFF(off, pgno * env->me_psize); + MAP(rc, env, id3.mptr, len, off); + if (rc) + goto fail; + if (!el[x].mref) { + munmap(el[x].mptr, env->me_psize * el[x].mcnt); + el[x].mptr = id3.mptr; + el[x].mcnt = id3.mcnt; + } else { + id3.mid = pg0; + pthread_mutex_unlock(&env->me_rpmutex); + goto found; + } + } + el[x].mref++; + pthread_mutex_unlock(&env->me_rpmutex); + goto found; + } + if (el[0].mid >= MDB_ERPAGE_MAX - env->me_rpcheck) { + /* purge unref'd pages */ + unsigned i, y = 0; + for (i=1; i<=el[0].mid; i++) { + if (!el[i].mref) { + if (!y) y = i; + munmap(el[i].mptr, env->me_psize * el[i].mcnt); + } + } + if (!y) { + if (retries) { + /* see if we can unref some local pages */ + retries--; + id3.mid = 0; + goto retry; + } + if (el[0].mid >= MDB_ERPAGE_MAX) { + pthread_mutex_unlock(&env->me_rpmutex); + return MDB_MAP_FULL; + } + env->me_rpcheck /= 2; + } else { + for (i=y+1; i<= el[0].mid; i++) + if (el[i].mref) + el[y++] = el[i]; + el[0].mid = y-1; + if (!env->me_rpcheck) + env->me_rpcheck = 1; + while (env->me_rpcheck < el[0].mid && env->me_rpcheck < MDB_ERPAGE_SIZE/2) + env->me_rpcheck *= 2; + } + } + SET_OFF(off, pgno * env->me_psize); + MAP(rc, env, id3.mptr, len, off); + if (rc) { +fail: + pthread_mutex_unlock(&env->me_rpmutex); + return rc; + } + /* check for overflow size */ + p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize); + if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) { + id3.mcnt = p->mp_pages + rem; + munmap(id3.mptr, len); + len = id3.mcnt * env->me_psize; + MAP(rc, env, id3.mptr, len, off); + if (rc) + goto fail; + } + mdb_mid3l_insert(el, &id3); + pthread_mutex_unlock(&env->me_rpmutex); +found: + mdb_mid3l_insert(tl, &id3); + } else { + return MDB_TXN_FULL; + } +ok: + p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize); +#if MDB_DEBUG /* we don't need this check any more */ + if (IS_OVERFLOW(p)) { + mdb_tassert(txn, p->mp_pages + rem <= id3.mcnt); + } +#endif + *ret = p; + return MDB_SUCCESS; +} +#endif + /** Find the address of the page corresponding to a given page number. - * Set #MDB_TXN_ERROR on failure. * @param[in] mc the cursor accessing the page. * @param[in] pgno the page number for the page to retrieve. * @param[out] ret address of a pointer where the page's address will be stored. @@ -5422,11 +5972,13 @@ static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) { MDB_txn *txn = mc->mc_txn; +#ifndef MDB_VL32 MDB_env *env = txn->mt_env; +#endif MDB_page *p = NULL; int level; - if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) { + if (! (mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP))) { MDB_txn *tx2 = txn; level = 1; do { @@ -5441,7 +5993,13 @@ mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) MDB_ID pn = pgno << 1; x = mdb_midl_search(tx2->mt_spill_pgs, pn); if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { +#ifdef MDB_VL32 + int rc = mdb_rpage_get(txn, pgno, &p); + if (rc) + return rc; +#else p = (MDB_page *)(env->me_map + env->me_psize * pgno); +#endif goto done; } } @@ -5458,9 +6016,17 @@ mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) if (pgno < txn->mt_next_pgno) { level = 0; +#ifdef MDB_VL32 + { + int rc = mdb_rpage_get(txn, pgno, &p); + if (rc) + return rc; + } +#else p = (MDB_page *)(env->me_map + env->me_psize * pgno); +#endif } else { - DPRINTF(("page %"Z"u not found", pgno)); + DPRINTF(("page %"Y"u not found", pgno)); txn->mt_flags |= MDB_TXN_ERROR; return MDB_PAGE_NOTFOUND; } @@ -5486,27 +6052,18 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) MDB_node *node; indx_t i; - DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); + DPRINTF(("branch page %"Y"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); /* Don't assert on branch pages in the FreeDB. We can get here * while in the process of rebalancing a FreeDB branch page; we must * let that proceed. ITS#8336 */ mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); - DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); + DPRINTF(("found index 0 to page %"Y"u", NODEPGNO(NODEPTR(mp, 0)))); if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { i = 0; - if (flags & MDB_PS_LAST) { + if (flags & MDB_PS_LAST) i = NUMKEYS(mp) - 1; - /* if already init'd, see if we're already in right place */ - if (mc->mc_flags & C_INITIALIZED) { - if (mc->mc_ki[mc->mc_top] == i) { - mc->mc_top = mc->mc_snum++; - mp = mc->mc_pg[mc->mc_top]; - goto ready; - } - } - } } else { int exact; node = mdb_node_search(mc, key, &exact); @@ -5532,7 +6089,6 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) if ((rc = mdb_cursor_push(mc, mp))) return rc; -ready: if (flags & MDB_PS_MODIFY) { if ((rc = mdb_page_touch(mc)) != 0) return rc; @@ -5547,7 +6103,7 @@ ready: return MDB_CORRUPTED; } - DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno, + DPRINTF(("found leaf page %"Y"u for key [%s]", mp->mp_pgno, key ? DKEY(key) : "null")); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; @@ -5643,14 +6199,26 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) } mdb_cassert(mc, root > 1); - if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) + if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) { +#ifdef MDB_VL32 + if (mc->mc_pg[0]) + MDB_PAGE_UNREF(mc->mc_txn, mc->mc_pg[0]); +#endif if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0) return rc; + } +#ifdef MDB_VL32 + { + int i; + for (i=1; imc_snum; i++) + MDB_PAGE_UNREF(mc->mc_txn, mc->mc_pg[i]); + } +#endif mc->mc_snum = 1; mc->mc_top = 0; - DPRINTF(("db %d root page %"Z"u has flags 0x%X", + DPRINTF(("db %d root page %"Y"u has flags 0x%X", DDBI(mc), root, mc->mc_pg[0]->mp_flags)); if (flags & MDB_PS_MODIFY) { @@ -5675,7 +6243,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) MDB_ID pn = pg << 1; int rc; - DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); + DPRINTF(("free ov page %"Y"u (%d)", pg, ovpages)); /* If the page is dirty or on the spill list we just acquired it, * so we should give it back to our current free list, if any. * Otherwise put it onto the list of pages we freed in this txn. @@ -5736,6 +6304,10 @@ release: if (rc) return rc; } +#ifdef MDB_VL32 + if (mc->mc_ovpg == mp) + mc->mc_ovpg = NULL; +#endif mc->mc_db->md_overflow_pages -= ovpages; return 0; } @@ -5753,6 +6325,12 @@ mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) pgno_t pgno; int rc; +#ifdef MDB_VL32 + if (mc->mc_ovpg) { + MDB_PAGE_UNREF(mc->mc_txn, mc->mc_ovpg); + mc->mc_ovpg = 0; + } +#endif if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { data->mv_size = NODEDSZ(leaf); data->mv_data = NODEDATA(leaf); @@ -5764,10 +6342,13 @@ mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) data->mv_size = NODEDSZ(leaf); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) { - DPRINTF(("read overflow page %"Z"u failed", pgno)); + DPRINTF(("read overflow page %"Y"u failed", pgno)); return rc; } data->mv_data = METADATA(omp); +#ifdef MDB_VL32 + mc->mc_ovpg = omp; +#endif return MDB_SUCCESS; } @@ -5778,7 +6359,7 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, { MDB_cursor mc; MDB_xcursor mx; - int exact = 0; + int exact = 0, rc; DKBUF; DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); @@ -5790,7 +6371,16 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, return MDB_BAD_TXN; mdb_cursor_init(&mc, txn, dbi, &mx); - return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); + rc = mdb_cursor_set(&mc, key, data, MDB_SET, &exact); +#ifdef MDB_VL32 + { + /* unref all the pages - caller must copy the data + * before doing anything else + */ + mdb_cursor_unref(&mc); + } +#endif + return rc; } /** Find a sibling for a page. @@ -5807,13 +6397,19 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) int rc; MDB_node *indx; MDB_page *mp; +#ifdef MDB_VL32 + MDB_page *op; +#endif if (mc->mc_snum < 2) { return MDB_NOTFOUND; /* root has no siblings */ } +#ifdef MDB_VL32 + op = mc->mc_pg[mc->mc_top]; +#endif mdb_cursor_pop(mc); - DPRINTF(("parent page is page %"Z"u, index %u", + DPRINTF(("parent page is page %"Y"u, index %u", mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) @@ -5836,6 +6432,8 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) } mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + MDB_PAGE_UNREF(mc->mc_txn, op); + indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if ((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) { /* mc will be inconsistent if caller does mc_snum++ as above */ @@ -5858,20 +6456,14 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) MDB_node *leaf; int rc; - if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP)) + if (mc->mc_flags & C_EOF) { return MDB_NOTFOUND; + } - if (!(mc->mc_flags & C_INITIALIZED)) - return mdb_cursor_first(mc, key, data); + mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); mp = mc->mc_pg[mc->mc_top]; - if (mc->mc_flags & C_EOF) { - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) - return MDB_NOTFOUND; - mc->mc_flags ^= C_EOF; - } - if (mc->mc_db->md_flags & MDB_DUPSORT) { leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5883,6 +6475,13 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return rc; } } +#ifdef MDB_VL32 + else { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); + } + } +#endif } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if (op == MDB_NEXT_DUP) @@ -5890,7 +6489,7 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } } - DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", + DPRINTF(("cursor_next: top page is %"Y"u in cursor %p", mdb_dbg_pgno(mp), (void *) mc)); if (mc->mc_flags & C_DEL) { mc->mc_flags ^= C_DEL; @@ -5904,12 +6503,12 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return rc; } mp = mc->mc_pg[mc->mc_top]; - DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + DPRINTF(("next page is %"Y"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); } else mc->mc_ki[mc->mc_top]++; skip: - DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + DPRINTF(("==> cursor points to page %"Y"u with %u keys, key index %u", mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); if (IS_LEAF2(mp)) { @@ -5947,12 +6546,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) MDB_node *leaf; int rc; - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = mdb_cursor_last(mc, key, data); - if (rc) - return rc; - mc->mc_ki[mc->mc_top]++; - } + mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); mp = mc->mc_pg[mc->mc_top]; @@ -5969,6 +6563,13 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return rc; } } +#ifdef MDB_VL32 + else { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); + } + } +#endif } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if (op == MDB_PREV_DUP) @@ -5976,7 +6577,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } } - DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", + DPRINTF(("cursor_prev: top page is %"Y"u in cursor %p", mdb_dbg_pgno(mp), (void *) mc)); mc->mc_flags &= ~(C_EOF|C_DEL); @@ -5988,11 +6589,13 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } mp = mc->mc_pg[mc->mc_top]; mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; - DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + DPRINTF(("prev page is %"Y"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); } else mc->mc_ki[mc->mc_top]--; - DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mc->mc_flags &= ~C_EOF; + + DPRINTF(("==> cursor points to page %"Y"u with %u keys, key index %u", mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); if (IS_LEAF2(mp)) { @@ -6035,8 +6638,14 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (key->mv_size == 0) return MDB_BAD_VALSIZE; - if (mc->mc_xcursor) + if (mc->mc_xcursor) { +#ifdef MDB_VL32 + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); + } +#endif mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + } /* See if we're already on the right page */ if (mc->mc_flags & C_INITIALIZED) { @@ -6102,7 +6711,6 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } rc = 0; - mc->mc_flags &= ~C_EOF; goto set2; } } @@ -6194,8 +6802,8 @@ set1: if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) return rc; dcmp = mc->mc_dbx->md_dcmp; -#if UINT_MAX < SIZE_MAX - if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) +#if UINT_MAX < SIZE_MAX || defined(MDB_VL32) + if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(mdb_size_t)) dcmp = mdb_cmp_clong; #endif rc = dcmp(data, &olddata); @@ -6229,8 +6837,14 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) int rc; MDB_node *leaf; - if (mc->mc_xcursor) + if (mc->mc_xcursor) { +#ifdef MDB_VL32 + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); + } +#endif mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + } if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); @@ -6273,16 +6887,25 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) int rc; MDB_node *leaf; - if (mc->mc_xcursor) + if (mc->mc_xcursor) { +#ifdef MDB_VL32 + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); + } +#endif mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_LAST); - if (rc != MDB_SUCCESS) - return rc; } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + if (!(mc->mc_flags & C_EOF)) { + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_LAST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + } mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; mc->mc_flags |= C_INITIALIZED|C_EOF; leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); @@ -6396,7 +7019,10 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, rc = MDB_INCOMPATIBLE; break; } - rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_first(mc, key, data); + else + rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); if (rc == MDB_SUCCESS) { if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { MDB_cursor *mx; @@ -6438,11 +7064,21 @@ fetchm: case MDB_NEXT: case MDB_NEXT_DUP: case MDB_NEXT_NODUP: - rc = mdb_cursor_next(mc, key, data, op); + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_first(mc, key, data); + else + rc = mdb_cursor_next(mc, key, data, op); break; case MDB_PREV: case MDB_PREV_DUP: case MDB_PREV_NODUP: + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdb_cursor_last(mc, key, data); + if (rc) + break; + mc->mc_flags |= C_INITIALIZED; + mc->mc_ki[mc->mc_top]++; + } rc = mdb_cursor_prev(mc, key, data, op); break; case MDB_FIRST: @@ -6459,11 +7095,6 @@ fetchm: rc = MDB_INCOMPATIBLE; break; } - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) { - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); - rc = MDB_NOTFOUND; - break; - } { MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -6505,8 +7136,7 @@ mdb_cursor_touch(MDB_cursor *mc) { int rc = MDB_SUCCESS; - if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { - /* Touch DB record of named DB */ + if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & DB_DIRTY)) { MDB_cursor mc2; MDB_xcursor mcx; if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) @@ -6731,8 +7361,8 @@ more: if (flags == MDB_CURRENT) goto current; dcmp = mc->mc_dbx->md_dcmp; -#if UINT_MAX < SIZE_MAX - if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) +#if UINT_MAX < SIZE_MAX || defined(MDB_VL32) + if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(mdb_size_t)) dcmp = mdb_cmp_clong; #endif /* does data match? */ @@ -6830,9 +7460,8 @@ prep_subDB: } else { memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, olddata.mv_size - fp->mp_upper - PAGEBASE); - memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0])); for (i=0; imp_ptrs[i] += offset; + mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; } } @@ -6886,13 +7515,8 @@ current: /* Note - this page is already counted in parent's dirty_room */ rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); mdb_cassert(mc, rc2 == 0); - /* Currently we make the page look as with put() in the - * parent txn, in case the user peeks at MDB_RESERVEd - * or unused parts. Some users treat ovpages specially. - */ if (!(flags & MDB_RESERVE)) { - /* Skip the part where LMDB will put *data. - * Copy end of page, adjusting alignment so + /* Copy end of page, adjusting alignment so * compiler may copy words instead of bytes. */ off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); @@ -6961,7 +7585,11 @@ new_sub: if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { m3->mc_ki[i]++; } - XCURSOR_REFRESH(m3, i, mp); + if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { + MDB_node *n2 = NODEPTR(mp, m3->mc_ki[i]); + if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + } } } } @@ -6974,7 +7602,7 @@ new_sub: */ if (do_sub) { int xflags, new_dupdata; - size_t ecount; + mdb_size_t ecount; put_sub: xdata.mv_size = 0; xdata.mv_data = ""; @@ -7003,6 +7631,7 @@ put_sub: MDB_xcursor *mx = mc->mc_xcursor; unsigned i = mc->mc_top; MDB_page *mp = mc->mc_pg[i]; + int nkeys = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; @@ -7010,8 +7639,10 @@ put_sub: if (m2->mc_pg[i] == mp) { if (m2->mc_ki[i] == mc->mc_ki[i]) { mdb_xcursor_init2(m2, mx, new_dupdata); - } else if (!insert_key) { - XCURSOR_REFRESH(m2, i, mp); + } else if (!insert_key && m2->mc_ki[i] < nkeys) { + MDB_node *n2 = NODEPTR(mp, m2->mc_ki[i]); + if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); } } } @@ -7116,7 +7747,13 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) continue; if (m2->mc_pg[mc->mc_top] == mp) { - XCURSOR_REFRESH(m2, mc->mc_top, mp); + if (m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) { + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } else { + MDB_node *n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); + if (!(n2->mn_flags & F_SUBDATA)) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + } } } } @@ -7161,7 +7798,6 @@ fail: } /** Allocate and initialize new pages for a database. - * Set #MDB_TXN_ERROR on failure. * @param[in] mc a cursor on the database being added to. * @param[in] flags flags defining what type of page is being allocated. * @param[in] num the number of pages to allocate. This is usually 1, @@ -7177,7 +7813,7 @@ mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) if ((rc = mdb_page_alloc(mc, num, &np))) return rc; - DPRINTF(("allocated new mpage %"Z"u, page size %u", + DPRINTF(("allocated new mpage %"Y"u, page size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize)); np->mp_flags = flags | P_DIRTY; np->mp_lower = (PAGEHDRSZ-PAGEBASE); @@ -7247,7 +7883,6 @@ mdb_branch_size(MDB_env *env, MDB_val *key) } /** Add a node to the page pointed to by the cursor. - * Set #MDB_TXN_ERROR on failure. * @param[in] mc The cursor for this operation. * @param[in] indx The index on the page where the new node should be added. * @param[in] key The key for the new node. @@ -7278,7 +7913,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); - DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", + DPRINTF(("add to %s %spage %"Y"u index %i, data size %"Z"u key size %"Z"u [%s]", IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, @@ -7319,7 +7954,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, goto full; if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) return rc; - DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); + DPRINTF(("allocated overflow page %"Y"u", ofp->mp_pgno)); flags |= F_BIGDATA; goto update; } else { @@ -7376,7 +8011,7 @@ update: return MDB_SUCCESS; full: - DPRINTF(("not enough room in page %"Z"u, got %u ptrs", + DPRINTF(("not enough room in page %"Y"u, got %u ptrs", mdb_dbg_pgno(mp), NUMKEYS(mp))); DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); DPRINTF(("node size = %"Z"u", node_size)); @@ -7399,7 +8034,7 @@ mdb_node_del(MDB_cursor *mc, int ksize) MDB_node *node; char *base; - DPRINTF(("delete node %u on %s page %"Z"u", indx, + DPRINTF(("delete node %u on %s page %"Y"u", indx, IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); numkeys = NUMKEYS(mp); mdb_cassert(mc, indx < numkeys); @@ -7508,7 +8143,10 @@ mdb_xcursor_init0(MDB_cursor *mc) mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; +#ifdef MDB_VL32 + mx->mx_cursor.mc_ovpg = 0; +#endif + mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP)); mx->mx_dbx.md_name.mv_size = 0; mx->mx_dbx.md_name.mv_data = NULL; mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; @@ -7527,12 +8165,12 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) { MDB_xcursor *mx = mc->mc_xcursor; + mx->mx_cursor.mc_flags &= C_SUB|C_ORIG_RDONLY|C_WRITEMAP; if (node->mn_flags & F_SUBDATA) { memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); mx->mx_cursor.mc_pg[0] = 0; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; } else { MDB_page *fp = NODEDATA(node); mx->mx_db.md_pad = 0; @@ -7545,7 +8183,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; + mx->mx_cursor.mc_flags |= C_INITIALIZED; mx->mx_cursor.mc_pg[0] = fp; mx->mx_cursor.mc_ki[0] = 0; if (mc->mc_db->md_flags & MDB_DUPFIXED) { @@ -7555,11 +8193,11 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) mx->mx_db.md_flags |= MDB_INTEGERKEY; } } - DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + DPRINTF(("Sub-db -%u root page %"Y"u", mx->mx_cursor.mc_dbi, mx->mx_db.md_root)); - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; -#if UINT_MAX < SIZE_MAX - if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ +#if UINT_MAX < SIZE_MAX || defined(MDB_VL32) + if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(mdb_size_t)) mx->mx_dbx.md_cmp = mdb_cmp_clong; #endif } @@ -7583,7 +8221,7 @@ mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags |= C_INITIALIZED; mx->mx_cursor.mc_ki[0] = 0; - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ #if UINT_MAX < SIZE_MAX mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; #endif @@ -7592,7 +8230,7 @@ mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) } mx->mx_db = src_mx->mx_db; mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; - DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + DPRINTF(("Sub-db -%u root page %"Y"u", mx->mx_cursor.mc_dbi, mx->mx_db.md_root)); } @@ -7611,7 +8249,10 @@ mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) mc->mc_top = 0; mc->mc_pg[0] = 0; mc->mc_ki[0] = 0; - mc->mc_flags = 0; +#ifdef MDB_VL32 + mc->mc_ovpg = 0; +#endif + mc->mc_flags = txn->mt_flags & (C_ORIG_RDONLY|C_WRITEMAP); if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { mdb_tassert(txn, mx != NULL); mc->mc_xcursor = mx; @@ -7676,7 +8317,7 @@ mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) /* Return the count of duplicate data items for the current key */ int -mdb_cursor_count(MDB_cursor *mc, size_t *countp) +mdb_cursor_count(MDB_cursor *mc, mdb_size_t *countp) { MDB_node *leaf; @@ -7692,15 +8333,9 @@ mdb_cursor_count(MDB_cursor *mc, size_t *countp) if (!(mc->mc_flags & C_INITIALIZED)) return EINVAL; - if (!mc->mc_snum) + if (!mc->mc_snum || (mc->mc_flags & C_EOF)) return MDB_NOTFOUND; - if (mc->mc_flags & C_EOF) { - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) - return MDB_NOTFOUND; - mc->mc_flags ^= C_EOF; - } - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { *countp = 1; @@ -7716,6 +8351,11 @@ mdb_cursor_count(MDB_cursor *mc, size_t *countp) void mdb_cursor_close(MDB_cursor *mc) { +#ifdef MDB_VL32 + if (mc) { + mdb_cursor_unref(mc); + } +#endif if (mc && !mc->mc_backup) { /* remove from txn, if tracked */ if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { @@ -7742,7 +8382,6 @@ mdb_cursor_dbi(MDB_cursor *mc) } /** Replace the key for a branch node with a new key. - * Set #MDB_TXN_ERROR on failure. * @param[in] mc Cursor pointing to the node to operate on. * @param[in] key The new key to use. * @return 0 on success, non-zero on failure. @@ -7768,7 +8407,7 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key) char kbuf2[DKBUF_MAXKEYSIZE*2+1]; k2.mv_data = NODEKEY(node); k2.mv_size = node->mn_ksize; - DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", + DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Y"u", indx, ptr, mdb_dkey(&k2, kbuf2), DKEY(key), @@ -7916,7 +8555,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) return rc; } - DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", + DPRINTF(("moving %s node %u [%s] on page %"Y"u to node %u on page %"Y"u", IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", csrc->mc_ki[csrc->mc_top], DKEY(&key), @@ -7962,8 +8601,12 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; m3->mc_ki[csrc->mc_top-1]++; } - if (IS_LEAF(mps)) - XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); + if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && + IS_LEAF(mps)) { + MDB_node *node = NODEPTR(m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } } } else /* Adding on the right, bump others down */ @@ -7984,8 +8627,12 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) } else { m3->mc_ki[csrc->mc_top]--; } - if (IS_LEAF(mps)) - XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); + if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && + IS_LEAF(mps)) { + MDB_node *node = NODEPTR(m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } } } } @@ -8002,7 +8649,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - DPRINTF(("update separator for source page %"Z"u to [%s]", + DPRINTF(("update separator for source page %"Y"u to [%s]", csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); mdb_cursor_copy(csrc, &mn); mn.mc_snum--; @@ -8033,7 +8680,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - DPRINTF(("update separator for destination page %"Z"u to [%s]", + DPRINTF(("update separator for destination page %"Y"u to [%s]", cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); mdb_cursor_copy(cdst, &mn); mn.mc_snum--; @@ -8079,7 +8726,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; - DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno)); + DPRINTF(("merging page %"Y"u into %"Y"u", psrc->mp_pgno, pdst->mp_pgno)); mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ mdb_cassert(csrc, cdst->mc_snum > 1); @@ -8136,7 +8783,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) } } - DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", + DPRINTF(("dst page %"Y"u now has %u keys (%.1f%% filled)", pdst->mp_pgno, NUMKEYS(pdst), (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); @@ -8186,8 +8833,12 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { m3->mc_ki[top-1]--; } - if (IS_LEAF(psrc)) - XCURSOR_REFRESH(m3, top, m3->mc_pg[top]); + if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && + IS_LEAF(psrc)) { + MDB_node *node = NODEPTR(m3->mc_pg[top], m3->mc_ki[top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } } } { @@ -8220,6 +8871,9 @@ mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) cdst->mc_snum = csrc->mc_snum; cdst->mc_top = csrc->mc_top; cdst->mc_flags = csrc->mc_flags; +#ifdef MDB_VL32 + cdst->mc_ovpg = csrc->mc_ovpg; +#endif for (i=0; imc_snum; i++) { cdst->mc_pg[i] = csrc->mc_pg[i]; @@ -8248,14 +8902,14 @@ mdb_rebalance(MDB_cursor *mc) minkeys = 1; thresh = FILL_THRESHOLD; } - DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", + DPRINTF(("rebalancing %s page %"Y"u (has %u keys, %.1f%% full)", IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { - DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", + DPRINTF(("no need to rebalance page %"Y"u, above fill threshold", mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); return MDB_SUCCESS; } @@ -8384,7 +9038,7 @@ mdb_rebalance(MDB_cursor *mc) fromleft = 1; } - DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", + DPRINTF(("found neighbor page %"Y"u (%u keys, %.1f%% full)", mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); @@ -8441,15 +9095,16 @@ mdb_cursor_del0(MDB_cursor *mc) if (m3->mc_pg[mc->mc_top] == mp) { if (m3->mc_ki[mc->mc_top] == ki) { m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDB_DUPSORT) { - /* Sub-cursor referred into dataset which is gone */ - m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - } - continue; + if (mc->mc_db->md_flags & MDB_DUPSORT) + m3->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else if (m3->mc_ki[mc->mc_top] > ki) { m3->mc_ki[mc->mc_top]--; } - XCURSOR_REFRESH(m3, mc->mc_top, mp); + if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { + MDB_node *node = NODEPTR(m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } } } } @@ -8475,32 +9130,11 @@ mdb_cursor_del0(MDB_cursor *mc) continue; if (m3->mc_pg[mc->mc_top] == mp) { /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdb_cursor_sibling(m3, 1); - if (rc == MDB_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDB_SUCCESS; - continue; - } - } - if (mc->mc_db->md_flags & MDB_DUPSORT) { - MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node has dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not initd it must be reinited. - * Else if node points to a subDB, nothing is needed. - * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset. - */ - if (node->mn_flags & F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node->mn_flags & F_SUBDATA)) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } else { - mdb_xcursor_init1(m3, node); - m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } - } + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(m3, 1); + if (rc == MDB_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDB_SUCCESS; } } } @@ -8575,7 +9209,6 @@ mdb_del0(MDB_txn *txn, MDB_dbi dbi, } /** Split a page and insert a new node. - * Set #MDB_TXN_ERROR on failure. * @param[in,out] mc Cursor pointing to the page and desired insertion index. * The cursor will be updated to point to the actual page and index where * the node got inserted after the split. @@ -8607,7 +9240,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno newindx = mc->mc_ki[mc->mc_top]; nkeys = NUMKEYS(mp); - DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", + DPRINTF(("-----> splitting %s page %"Y"u and adding [%s] at index %i/%i", IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); @@ -8615,7 +9248,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) return rc; rp->mp_pad = mp->mp_pad; - DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); + DPRINTF(("new right sibling: page %"Y"u", rp->mp_pgno)); /* Usually when splitting the root page, the cursor * height is 1. But when called from mdb_update_key, @@ -8633,7 +9266,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; - DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); + DPRINTF(("root split! new root = %"Y"u", pp->mp_pgno)); new_root = mc->mc_db->md_depth++; /* Add left (implicit) pointer. */ @@ -8650,7 +9283,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno ptop = 0; } else { ptop = mc->mc_top-1; - DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); + DPRINTF(("parent branch page is %"Y"u", mc->mc_pg[ptop]->mp_pgno)); } mdb_cursor_copy(mc, &mn); @@ -8749,7 +9382,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno * the split so the new page is emptier than the old page. * This yields better packing during sequential inserts. */ - if (nkeys < 32 || nsize > pmax/16 || newindx >= nkeys) { + if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { /* Find split point */ psize = 0; if (newindx <= split_indx || newindx >= nkeys) { @@ -8985,8 +9618,12 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { m3->mc_ki[ptop]++; } - if (IS_LEAF(mp)) - XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]); + if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && + IS_LEAF(mp)) { + MDB_node *node = NODEPTR(m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } } } DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); @@ -9027,26 +9664,23 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, #ifndef MDB_WBUF #define MDB_WBUF (1024*1024) #endif -#define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ - /** State needed for a double-buffering compacting copy. */ + /** State needed for a compacting copy. */ typedef struct mdb_copy { - MDB_env *mc_env; - MDB_txn *mc_txn; pthread_mutex_t mc_mutex; - pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ + pthread_cond_t mc_cond; char *mc_wbuf[2]; char *mc_over[2]; + MDB_env *mc_env; + MDB_txn *mc_txn; int mc_wlen[2]; int mc_olen[2]; pgno_t mc_next_pgno; HANDLE mc_fd; - int mc_toggle; /**< Buffer number in provider */ - int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ - /** Error code. Never cleared if set. Both threads can set nonzero - * to fail the copy. Not mutex-protected, LMDB expects atomic int. - */ - volatile int mc_error; + int mc_status; + volatile int mc_new; + int mc_toggle; + } mdb_copy; /** Dedicated writer thread for compacting copy. */ @@ -9062,38 +9696,26 @@ mdb_env_copythr(void *arg) #else int len; #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) -#ifdef SIGPIPE - sigset_t set; - sigemptyset(&set); - sigaddset(&set, SIGPIPE); - if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) - my->mc_error = rc; -#endif #endif pthread_mutex_lock(&my->mc_mutex); + my->mc_new = 0; + pthread_cond_signal(&my->mc_cond); for(;;) { while (!my->mc_new) pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ + if (my->mc_new < 0) { + my->mc_new = 0; break; + } + my->mc_new = 0; wsize = my->mc_wlen[toggle]; ptr = my->mc_wbuf[toggle]; again: - rc = MDB_SUCCESS; - while (wsize > 0 && !my->mc_error) { + while (wsize > 0) { DO_WRITE(rc, my->mc_fd, ptr, wsize, len); if (!rc) { rc = ErrCode(); -#if defined(SIGPIPE) && !defined(_WIN32) - if (rc == EPIPE) { - /* Collect the pending SIGPIPE, otherwise at least OS X - * gives it to the process on thread-exit (ITS#8504). - */ - int tmp; - sigwait(&set, &tmp); - } -#endif break; } else if (len > 0) { rc = MDB_SUCCESS; @@ -9106,7 +9728,8 @@ again: } } if (rc) { - my->mc_error = rc; + my->mc_status = rc; + break; } /* If there's an overflow page tail, write it too */ if (my->mc_olen[toggle]) { @@ -9117,45 +9740,38 @@ again: } my->mc_wlen[toggle] = 0; toggle ^= 1; - /* Return the empty buffer to provider */ - my->mc_new--; pthread_cond_signal(&my->mc_cond); } + pthread_cond_signal(&my->mc_cond); pthread_mutex_unlock(&my->mc_mutex); return (THREAD_RET)0; #undef DO_WRITE } - /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. - * - * @param[in] my control structure. - * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). - */ + /** Tell the writer thread there's a buffer ready to write */ static int ESECT -mdb_env_cthr_toggle(mdb_copy *my, int adjust) +mdb_env_cthr_toggle(mdb_copy *my, int st) { + int toggle = my->mc_toggle ^ 1; pthread_mutex_lock(&my->mc_mutex); - my->mc_new += adjust; - pthread_cond_signal(&my->mc_cond); - while (my->mc_new & 2) /* both buffers in use */ + if (my->mc_status) { + pthread_mutex_unlock(&my->mc_mutex); + return my->mc_status; + } + while (my->mc_new == 1) pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + my->mc_new = st; + my->mc_toggle = toggle; + pthread_cond_signal(&my->mc_cond); pthread_mutex_unlock(&my->mc_mutex); - - my->mc_toggle ^= (adjust & 1); - /* Both threads reset mc_wlen, to be safe from threading errors */ - my->mc_wlen[my->mc_toggle] = 0; - return my->mc_error; + return 0; } - /** Depth-first tree traversal for compacting copy. - * @param[in] my control structure. - * @param[in,out] pg database root. - * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. - */ + /** Depth-first tree traversal for compacting copy. */ static int ESECT mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) { - MDB_cursor mc = {0}; + MDB_cursor mc; MDB_node *ni; MDB_page *mo, *mp, *leaf; char *buf, *ptr; @@ -9167,6 +9783,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) return MDB_SUCCESS; mc.mc_snum = 1; + mc.mc_top = 0; mc.mc_txn = my->mc_txn; rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); @@ -9213,7 +9830,6 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) } memcpy(&pg, NODEDATA(ni), sizeof(pg)); - memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); rc = mdb_page_get(&mc, pg, &omp, NULL); if (rc) goto done; @@ -9236,6 +9852,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) goto done; toggle = my->mc_toggle; } + memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t)); } else if (ni->mn_flags & F_SUBDATA) { MDB_db db; @@ -9313,56 +9930,47 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) { MDB_meta *mm; MDB_page *mp; - mdb_copy my = {0}; + mdb_copy my; MDB_txn *txn = NULL; pthread_t thr; - pgno_t root, new_root; - int rc = MDB_SUCCESS; + int rc; #ifdef _WIN32 - if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) || - !(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) { - rc = ErrCode(); - goto done; - } + my.mc_mutex = CreateMutex(NULL, FALSE, NULL); + my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL); my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize); - if (my.mc_wbuf[0] == NULL) { - /* _aligned_malloc() sets errno, but we use Windows error codes */ - rc = ERROR_NOT_ENOUGH_MEMORY; - goto done; - } + if (my.mc_wbuf[0] == NULL) + return errno; #else - if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) - return rc; - if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) - goto done2; + pthread_mutex_init(&my.mc_mutex, NULL); + pthread_cond_init(&my.mc_cond, NULL); #ifdef HAVE_MEMALIGN my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); - if (my.mc_wbuf[0] == NULL) { - rc = errno; - goto done; - } + if (my.mc_wbuf[0] == NULL) + return errno; #else - { - void *p; - if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0) - goto done; - my.mc_wbuf[0] = p; - } + rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2); + if (rc) + return rc; #endif #endif memset(my.mc_wbuf[0], 0, MDB_WBUF*2); my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; + my.mc_wlen[0] = 0; + my.mc_wlen[1] = 0; + my.mc_olen[0] = 0; + my.mc_olen[1] = 0; my.mc_next_pgno = NUM_METAS; + my.mc_status = 0; + my.mc_new = 1; + my.mc_toggle = 0; my.mc_env = env; my.mc_fd = fd; - rc = THREAD_CREATE(thr, mdb_env_copythr, &my); - if (rc) - goto done; + THREAD_CREATE(thr, mdb_env_copythr, &my); rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); if (rc) - goto finish; + return rc; mp = (MDB_page *)my.mc_wbuf[0]; memset(mp, 0, NUM_METAS * env->me_psize); @@ -9378,64 +9986,57 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) *(MDB_meta *)METADATA(mp) = *mm; mm = (MDB_meta *)METADATA(mp); - /* Set metapage 1 with current main DB */ - root = new_root = txn->mt_dbs[MAIN_DBI].md_root; - if (root != P_INVALID) { - /* Count free pages + freeDB pages. Subtract from last_pg - * to find the new last_pg, which also becomes the new root. - */ + /* Count the number of free pages, subtract from lastpg to find + * number of active pages + */ + { MDB_ID freecount = 0; MDB_cursor mc; MDB_val key, data; mdb_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) freecount += *(MDB_ID *)data.mv_data; - if (rc != MDB_NOTFOUND) - goto finish; freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + txn->mt_dbs[FREE_DBI].md_leaf_pages + txn->mt_dbs[FREE_DBI].md_overflow_pages; - new_root = txn->mt_next_pgno - 1 - freecount; - mm->mm_last_pg = new_root; + /* Set metapage 1 */ + mm->mm_last_pg = txn->mt_next_pgno - freecount - 1; mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - mm->mm_dbs[MAIN_DBI].md_root = new_root; - } else { - /* When the DB is empty, handle it specially to - * fix any breakage like page leaks from ITS#8174. - */ - mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; + if (mm->mm_last_pg > NUM_METAS-1) { + mm->mm_dbs[MAIN_DBI].md_root = mm->mm_last_pg; + mm->mm_txnid = 1; + } else { + mm->mm_dbs[MAIN_DBI].md_root = P_INVALID; + } } - if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { - mm->mm_txnid = 1; /* use metapage 1 */ - } - my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_txn = txn; - rc = mdb_env_cwalk(&my, &root, 0); - if (rc == MDB_SUCCESS && root != new_root) { - rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ - } + pthread_mutex_lock(&my.mc_mutex); + while(my.mc_new) + pthread_cond_wait(&my.mc_cond, &my.mc_mutex); + pthread_mutex_unlock(&my.mc_mutex); + rc = mdb_env_cwalk(&my, &txn->mt_dbs[MAIN_DBI].md_root, 0); + if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle]) + rc = mdb_env_cthr_toggle(&my, 1); + mdb_env_cthr_toggle(&my, -1); + pthread_mutex_lock(&my.mc_mutex); + while(my.mc_new) + pthread_cond_wait(&my.mc_cond, &my.mc_mutex); + pthread_mutex_unlock(&my.mc_mutex); + THREAD_FINISH(thr); -finish: - if (rc) - my.mc_error = rc; - mdb_env_cthr_toggle(&my, 1 | MDB_EOF); - rc = THREAD_FINISH(thr); mdb_txn_abort(txn); - -done: #ifdef _WIN32 - if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]); - if (my.mc_cond) CloseHandle(my.mc_cond); - if (my.mc_mutex) CloseHandle(my.mc_mutex); + CloseHandle(my.mc_cond); + CloseHandle(my.mc_mutex); + _aligned_free(my.mc_wbuf[0]); #else - free(my.mc_wbuf[0]); pthread_cond_destroy(&my.mc_cond); -done2: pthread_mutex_destroy(&my.mc_mutex); + free(my.mc_wbuf[0]); #endif - return rc ? rc : my.mc_error; + return rc; } /** Copy environment as-is. */ @@ -9445,7 +10046,7 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) MDB_txn *txn = NULL; mdb_mutexref_t wmutex = NULL; int rc; - size_t wsize, w3; + mdb_size_t wsize, w3; char *ptr; #ifdef _WIN32 DWORD len, w2; @@ -9506,7 +10107,7 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) w3 = txn->mt_next_pgno * env->me_psize; { - size_t fsize = 0; + mdb_size_t fsize = 0; if ((rc = mdb_fsize(env->me_fd, &fsize))) goto leave; if (w3 > fsize) @@ -9556,20 +10157,67 @@ mdb_env_copyfd(MDB_env *env, HANDLE fd) int ESECT mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) { - int rc; - MDB_name fname; + int rc, len; + char *lpath; HANDLE newfd = INVALID_HANDLE_VALUE; +#ifdef _WIN32 + wchar_t *wpath; +#endif - rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname); - if (rc == MDB_SUCCESS) { - rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd); - mdb_fname_destroy(fname); + if (env->me_flags & MDB_NOSUBDIR) { + lpath = (char *)path; + } else { + len = strlen(path); + len += sizeof(DATANAME); + lpath = malloc(len); + if (!lpath) + return ENOMEM; + sprintf(lpath, "%s" DATANAME, path); } - if (rc == MDB_SUCCESS) { - rc = mdb_env_copyfd2(env, newfd, flags); + + /* The destination path must exist, but the destination file must not. + * We don't want the OS to cache the writes, since the source data is + * already in the OS cache. + */ +#ifdef _WIN32 + rc = utf8_to_utf16(lpath, -1, &wpath, NULL); + if (rc) + goto leave; + newfd = CreateFileW(wpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, + FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); + free(wpath); +#else + newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); +#endif + if (newfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } + + if (env->me_psize >= env->me_os_psize) { +#ifdef O_DIRECT + /* Set O_DIRECT if the file system supports it */ + if ((rc = fcntl(newfd, F_GETFL)) != -1) + (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); +#endif +#ifdef F_NOCACHE /* __APPLE__ */ + rc = fcntl(newfd, F_NOCACHE, 1); + if (rc) { + rc = ErrCode(); + goto leave; + } +#endif + } + + rc = mdb_env_copyfd2(env, newfd, flags); + +leave: + if (!(env->me_flags & MDB_NOSUBDIR)) + free(lpath); + if (newfd != INVALID_HANDLE_VALUE) if (close(newfd) < 0 && rc == MDB_SUCCESS) rc = ErrCode(); - } + return rc; } @@ -9791,11 +10439,8 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) return MDB_INCOMPATIBLE; - } else { - if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE)) - return rc; - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) - return EACCES; + } else if (! (rc == MDB_NOTFOUND && (flags & MDB_CREATE))) { + return rc; } /* Done here so we cannot fail after creating a new DB */ @@ -9809,8 +10454,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db memset(&dummy, 0, sizeof(dummy)); dummy.md_root = P_INVALID; dummy.md_flags = flags & PERSISTENT_FLAGS; - WITH_CURSOR_TRACKING(mc, - rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); dbflag |= DB_DIRTY; } @@ -9910,6 +10554,11 @@ mdb_drop0(MDB_cursor *mc, int subs) mdb_cursor_pop(mc); mdb_cursor_copy(mc, &mx); +#ifdef MDB_VL32 + /* bump refcount for mx's pages */ + for (i=0; imc_snum; i++) + mdb_page_get(&mx, mc->mc_pg[i]->mp_pgno, &mx.mc_pg[i], NULL); +#endif while (mc->mc_snum > 0) { MDB_page *mp = mc->mc_pg[mc->mc_top]; unsigned n = NUMKEYS(mp); @@ -9975,6 +10624,10 @@ pop: done: if (rc) txn->mt_flags |= MDB_TXN_ERROR; +#ifdef MDB_VL32 + /* drop refcount for mx's pages */ + mdb_cursor_unref(&mx); +#endif } else if (rc == MDB_NOTFOUND) { rc = MDB_SUCCESS; } @@ -10094,7 +10747,7 @@ mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) if (mr[i].mr_pid) { txnid_t txnid = mr[i].mr_txnid; sprintf(buf, txnid == (txnid_t)-1 ? - "%10d %"Z"x -\n" : "%10d %"Z"x %"Z"u\n", + "%10d %"Z"x -\n" : "%10d %"Z"x %"Y"u\n", (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); if (first) { first = 0; @@ -10163,7 +10816,7 @@ mdb_reader_check(MDB_env *env, int *dead) return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; } -/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ +/** As #mdb_reader_check(). rlocked = . */ static int ESECT mdb_reader_check0(MDB_env *env, int rlocked, int *dead) { @@ -10199,7 +10852,7 @@ mdb_reader_check0(MDB_env *env, int rlocked, int *dead) } for (; jmn_alloced = 1; - dst->mn_len = need - 1; - dst->mn_val = result; - return MDB_SUCCESS; - } + int need; + wchar_t *result; + need = MultiByteToWideChar(CP_UTF8, 0, src, srcsize, NULL, 0); + if (need == 0xFFFD) + return EILSEQ; + if (need == 0) + return EINVAL; + result = malloc(sizeof(wchar_t) * need); + if (!result) + return ENOMEM; + MultiByteToWideChar(CP_UTF8, 0, src, srcsize, result, need); + if (dstsize) + *dstsize = need; + *dst = result; + return 0; } #endif /* defined(_WIN32) */ -/** @} */ diff --git a/contrib/db/liblmdb/mdb_copy.1 b/contrib/db/liblmdb/mdb_copy.1 index 594ff124..401e47ab 100644 --- a/contrib/db/liblmdb/mdb_copy.1 +++ b/contrib/db/liblmdb/mdb_copy.1 @@ -1,5 +1,5 @@ -.TH MDB_COPY 1 "2014/07/01" "LMDB 0.9.14" -.\" Copyright 2012-2018 Howard Chu, Symas Corp. All Rights Reserved. +.TH MDB_COPY 1 "2014/06/20" "LMDB 0.9.14" +.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_copy \- LMDB environment copy tool @@ -11,6 +11,8 @@ mdb_copy \- LMDB environment copy tool .BR \-c ] [\c .BR \-n ] +[\c +.BR \-v ] .B srcpath [\c .BR dstpath ] @@ -36,10 +38,13 @@ Write the library version number to the standard output, and exit. Compact while copying. Only current data pages will be copied; freed or unused pages will be omitted from the copy. This option will slow down the backup process as it is more CPU-intensive. -Currently it fails if the environment has suffered a page leak. .TP .BR \-n Open LDMB environment(s) which do not use subdirectories. +.TP +.BR \-v +Use the previous environment state instead of the latest state. +This may be useful if the latest state has been corrupted. .SH DIAGNOSTICS Exit status is zero if no errors occur. diff --git a/contrib/db/liblmdb/mdb_copy.c b/contrib/db/liblmdb/mdb_copy.c index 1b89396e..95a6e713 100644 --- a/contrib/db/liblmdb/mdb_copy.c +++ b/contrib/db/liblmdb/mdb_copy.c @@ -1,6 +1,6 @@ /* mdb_copy.c - memory-mapped database backup tool */ /* - * Copyright 2012-2018 Howard Chu, Symas Corp. + * Copyright 2012-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,6 +38,8 @@ int main(int argc,char * argv[]) for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) { if (argv[1][1] == 'n' && argv[1][2] == '\0') flags |= MDB_NOSUBDIR; + else if (argv[1][1] == 'v' && argv[1][2] == '\0') + flags |= MDB_PREVSNAPSHOT; else if (argv[1][1] == 'c' && argv[1][2] == '\0') cpflags |= MDB_CP_COMPACT; else if (argv[1][1] == 'V' && argv[1][2] == '\0') { @@ -48,7 +50,7 @@ int main(int argc,char * argv[]) } if (argc<2 || argc>3) { - fprintf(stderr, "usage: %s [-V] [-c] [-n] srcpath [dstpath]\n", progname); + fprintf(stderr, "usage: %s [-V] [-c] [-n] [-v] srcpath [dstpath]\n", progname); exit(EXIT_FAILURE); } diff --git a/contrib/db/liblmdb/mdb_dump.1 b/contrib/db/liblmdb/mdb_dump.1 index 72cf6ca8..a25fb92e 100644 --- a/contrib/db/liblmdb/mdb_dump.1 +++ b/contrib/db/liblmdb/mdb_dump.1 @@ -1,5 +1,5 @@ -.TH MDB_DUMP 1 "2015/09/30" "LMDB 0.9.17" -.\" Copyright 2014-2018 Howard Chu, Symas Corp. All Rights Reserved. +.TH MDB_DUMP 1 "2014/06/20" "LMDB 0.9.14" +.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_dump \- LMDB environment export tool @@ -14,6 +14,8 @@ mdb_dump \- LMDB environment export tool [\c .BR \-n ] [\c +.BR \-v ] +[\c .BR \-p ] [\c .BR \-a \ | @@ -42,6 +44,10 @@ names will be listed, no data will be output. .BR \-n Dump an LMDB database which does not use subdirectories. .TP +.BR \-v +Use the previous environment state instead of the latest state. +This may be useful if the latest state has been corrupted. +.TP .BR \-p If characters in either the key or data items are printing characters (as defined by isprint(3)), output them directly. This option permits users to diff --git a/contrib/db/liblmdb/mdb_dump.c b/contrib/db/liblmdb/mdb_dump.c index 9df5dc0b..7a42bc0b 100644 --- a/contrib/db/liblmdb/mdb_dump.c +++ b/contrib/db/liblmdb/mdb_dump.c @@ -1,6 +1,6 @@ /* mdb_dump.c - memory-mapped database dump tool */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -25,6 +25,15 @@ #else #define Z "z" #endif +#ifdef MDB_VL32 +#ifdef _WIN32 +#define Y "I64" +#else +#define Y "ll" +#endif +#else +#define Y Z +#endif #define PRINT 1 static int mode; @@ -115,7 +124,7 @@ static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) if (name) printf("database=%s\n", name); printf("type=btree\n"); - printf("mapsize=%" Z "u\n", info.me_mapsize); + printf("mapsize=%" Y "u\n", info.me_mapsize); if (info.me_mapaddr) printf("mapaddr=%p\n", info.me_mapaddr); printf("maxreaders=%u\n", info.me_maxreaders); @@ -155,7 +164,7 @@ static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) static void usage(char *prog) { - fprintf(stderr, "usage: %s [-V] [-f output] [-l] [-n] [-p] [-a|-s subdb] dbpath\n", prog); + fprintf(stderr, "usage: %s [-V] [-f output] [-l] [-n] [-p] [-v] [-a|-s subdb] dbpath\n", prog); exit(EXIT_FAILURE); } @@ -179,6 +188,7 @@ int main(int argc, char *argv[]) * -n: use NOSUBDIR flag on env_open * -p: use printable characters * -f: write to file instead of stdout + * -v: use previous snapshot * -V: print version and exit * (default) dump only the main DB */ @@ -206,6 +216,9 @@ int main(int argc, char *argv[]) case 'n': envflags |= MDB_NOSUBDIR; break; + case 'v': + envflags |= MDB_PREVSNAPSHOT; + break; case 'p': mode |= PRINT; break; diff --git a/contrib/db/liblmdb/mdb_load.1 b/contrib/db/liblmdb/mdb_load.1 index 998acc12..ede3702d 100644 --- a/contrib/db/liblmdb/mdb_load.1 +++ b/contrib/db/liblmdb/mdb_load.1 @@ -1,5 +1,5 @@ -.TH MDB_LOAD 1 "2015/09/30" "LMDB 0.9.17" -.\" Copyright 2014-2018 Howard Chu, Symas Corp. All Rights Reserved. +.TH MDB_LOAD 1 "2014/06/20" "LMDB 0.9.14" +.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_load \- LMDB environment import tool @@ -37,6 +37,13 @@ option below. .BR \-V Write the library version number to the standard output, and exit. .TP +.BR \-a +Append all records in the order they appear in the input. The input is assumed to already be +in correctly sorted order and no sorting or checking for redundant values will be performed. +This option must be used to reload data that was produced by running +.B mdb_dump +on a database that uses custom compare functions. +.TP .BR \-f \ file Read from the specified file instead of from the standard input. .TP diff --git a/contrib/db/liblmdb/mdb_load.c b/contrib/db/liblmdb/mdb_load.c index 0f177f1e..797c2f97 100644 --- a/contrib/db/liblmdb/mdb_load.c +++ b/contrib/db/liblmdb/mdb_load.c @@ -1,6 +1,6 @@ /* mdb_load.c - memory-mapped database load tool */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,12 +37,22 @@ static int Eof; static MDB_envinfo info; static MDB_val kbuf, dbuf; +static MDB_val k0buf; #ifdef _WIN32 #define Z "I" #else #define Z "z" #endif +#ifdef MDB_VL32 +#ifdef _WIN32 +#define Y "I64" +#else +#define Y "ll" +#endif +#else +#define Y Z +#endif #define STRLENOF(s) (sizeof(s)-1) @@ -68,7 +78,6 @@ static void readhdr(void) { char *ptr; - flags = 0; while (fgets(dbuf.mv_data, dbuf.mv_size, stdin) != NULL) { lineno++; if (!strncmp(dbuf.mv_data, "VERSION=", STRLENOF("VERSION="))) { @@ -113,7 +122,7 @@ static void readhdr(void) int i; ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data+STRLENOF("mapsize="), "%" Z "u", &info.me_mapsize); + i = sscanf((char *)dbuf.mv_data+STRLENOF("mapsize="), "%" Y "u", &info.me_mapsize); if (i != 1) { fprintf(stderr, "%s: line %" Z "d: invalid mapsize %s\n", prog, lineno, (char *)dbuf.mv_data+STRLENOF("mapsize=")); @@ -249,8 +258,7 @@ badend: c2 += 2; } } else { - /* copies are redundant when no escapes were used */ - *c1++ = *c2++; + c1++; c2++; } } } else { @@ -278,10 +286,15 @@ badend: static void usage(void) { - fprintf(stderr, "usage: %s [-V] [-f input] [-n] [-s name] [-N] [-T] dbpath\n", prog); + fprintf(stderr, "usage: %s [-V] [-a] [-f input] [-n] [-s name] [-N] [-T] dbpath\n", prog); exit(EXIT_FAILURE); } +static int greater(const MDB_val *a, const MDB_val *b) +{ + return 1; +} + int main(int argc, char *argv[]) { int i, rc; @@ -291,7 +304,8 @@ int main(int argc, char *argv[]) MDB_dbi dbi; char *envname; int envflags = 0, putflags = 0; - int dohdr = 0; + int dohdr = 0, append = 0; + MDB_val prevk; prog = argv[0]; @@ -299,19 +313,23 @@ int main(int argc, char *argv[]) usage(); } - /* -f: load file instead of stdin + /* -a: append records in input order + * -f: load file instead of stdin * -n: use NOSUBDIR flag on env_open * -s: load into named subDB * -N: use NOOVERWRITE on puts * -T: read plaintext * -V: print version and exit */ - while ((i = getopt(argc, argv, "f:ns:NTV")) != EOF) { + while ((i = getopt(argc, argv, "af:ns:NTV")) != EOF) { switch(i) { case 'V': printf("%s\n", MDB_VERSION_STRING); exit(0); break; + case 'a': + append = 1; + break; case 'f': if (freopen(optarg, "r", stdin) == NULL) { fprintf(stderr, "%s: %s: reopen: %s\n", @@ -370,11 +388,17 @@ int main(int argc, char *argv[]) } kbuf.mv_size = mdb_env_get_maxkeysize(env) * 2 + 2; - kbuf.mv_data = malloc(kbuf.mv_size); + kbuf.mv_data = malloc(kbuf.mv_size * 2); + k0buf.mv_size = kbuf.mv_size; + k0buf.mv_data = (char *)kbuf.mv_data + kbuf.mv_size; + prevk.mv_size = 0; + prevk.mv_data = k0buf.mv_data; while(!Eof) { MDB_val key, data; int batch = 0; + flags = 0; + int appflag; if (!dohdr) { dohdr = 1; @@ -392,6 +416,11 @@ int main(int argc, char *argv[]) fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); goto txn_abort; } + if (append) { + mdb_set_compare(txn, dbi, greater); + if (flags & MDB_DUPSORT) + mdb_set_dupsort(txn, dbi, greater); + } rc = mdb_cursor_open(txn, dbi, &mc); if (rc) { @@ -410,7 +439,20 @@ int main(int argc, char *argv[]) goto txn_abort; } - rc = mdb_cursor_put(mc, &key, &data, putflags); + if (append) { + appflag = MDB_APPEND; + if (flags & MDB_DUPSORT) { + if (prevk.mv_size == key.mv_size && !memcmp(prevk.mv_data, key.mv_data, key.mv_size)) + appflag = MDB_APPENDDUP; + else { + memcpy(prevk.mv_data, key.mv_data, key.mv_size); + prevk.mv_size = key.mv_size; + } + } + } else { + appflag = 0; + } + rc = mdb_cursor_put(mc, &key, &data, putflags|appflag); if (rc == MDB_KEYEXIST && putflags) continue; if (rc) { diff --git a/contrib/db/liblmdb/mdb_stat.1 b/contrib/db/liblmdb/mdb_stat.1 index 7c3f2846..bf49bd3b 100644 --- a/contrib/db/liblmdb/mdb_stat.1 +++ b/contrib/db/liblmdb/mdb_stat.1 @@ -1,5 +1,5 @@ -.TH MDB_STAT 1 "2015/09/30" "LMDB 0.9.17" -.\" Copyright 2012-2018 Howard Chu, Symas Corp. All Rights Reserved. +.TH MDB_STAT 1 "2014/06/20" "LMDB 0.9.14" +.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_stat \- LMDB environment status tool @@ -14,6 +14,8 @@ mdb_stat \- LMDB environment status tool [\c .BR \-n ] [\c +.BR \-v ] +[\c .BR \-r [ r ]] [\c .BR \-a \ | @@ -39,6 +41,10 @@ If \fB\-fff\fP is given, display the full list of page IDs in the freelist. .BR \-n Display the status of an LMDB database which does not use subdirectories. .TP +.BR \-v +Use the previous environment state instead of the latest state. +This may be useful if the latest state has been corrupted. +.TP .BR \-r Display information about the environment reader table. Shows the process ID, thread ID, and transaction ID for each active diff --git a/contrib/db/liblmdb/mdb_stat.c b/contrib/db/liblmdb/mdb_stat.c index f4c0dc1e..30ec81fe 100644 --- a/contrib/db/liblmdb/mdb_stat.c +++ b/contrib/db/liblmdb/mdb_stat.c @@ -1,6 +1,6 @@ /* mdb_stat.c - memory-mapped database status tool */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -22,6 +22,15 @@ #else #define Z "z" #endif +#ifdef MDB_VL32 +#ifdef _WIN32 +#define Y "I64" +#else +#define Y "ll" +#endif +#else +#define Y Z +#endif static void prstat(MDB_stat *ms) { @@ -29,15 +38,15 @@ static void prstat(MDB_stat *ms) printf(" Page size: %u\n", ms->ms_psize); #endif printf(" Tree depth: %u\n", ms->ms_depth); - printf(" Branch pages: %"Z"u\n", ms->ms_branch_pages); - printf(" Leaf pages: %"Z"u\n", ms->ms_leaf_pages); - printf(" Overflow pages: %"Z"u\n", ms->ms_overflow_pages); - printf(" Entries: %"Z"u\n", ms->ms_entries); + printf(" Branch pages: %"Y"u\n", ms->ms_branch_pages); + printf(" Leaf pages: %"Y"u\n", ms->ms_leaf_pages); + printf(" Overflow pages: %"Y"u\n", ms->ms_overflow_pages); + printf(" Entries: %"Y"u\n", ms->ms_entries); } static void usage(char *prog) { - fprintf(stderr, "usage: %s [-V] [-n] [-e] [-r[r]] [-f[f[f]]] [-a|-s subdb] dbpath\n", prog); + fprintf(stderr, "usage: %s [-V] [-n] [-e] [-r[r]] [-f[f[f]]] [-v] [-a|-s subdb] dbpath\n", prog); exit(EXIT_FAILURE); } @@ -64,6 +73,7 @@ int main(int argc, char *argv[]) * -f: print freelist info * -r: print reader info * -n: use NOSUBDIR flag on env_open + * -v: use previous snapshot * -V: print version and exit * (default) print stat of only the main DB */ @@ -87,6 +97,9 @@ int main(int argc, char *argv[]) case 'n': envflags |= MDB_NOSUBDIR; break; + case 'v': + envflags |= MDB_PREVSNAPSHOT; + break; case 'r': rdrinfo++; break; @@ -125,11 +138,11 @@ int main(int argc, char *argv[]) (void)mdb_env_info(env, &mei); printf("Environment Info\n"); printf(" Map address: %p\n", mei.me_mapaddr); - printf(" Map size: %"Z"u\n", mei.me_mapsize); + printf(" Map size: %"Y"u\n", mei.me_mapsize); printf(" Page size: %u\n", mst.ms_psize); - printf(" Max pages: %"Z"u\n", mei.me_mapsize / mst.ms_psize); - printf(" Number of pages used: %"Z"u\n", mei.me_last_pgno+1); - printf(" Last transaction ID: %"Z"u\n", mei.me_last_txnid); + printf(" Max pages: %"Y"u\n", mei.me_mapsize / mst.ms_psize); + printf(" Number of pages used: %"Y"u\n", mei.me_last_pgno+1); + printf(" Last transaction ID: %"Y"u\n", mei.me_last_txnid); printf(" Max readers: %u\n", mei.me_maxreaders); printf(" Number of readers used: %u\n", mei.me_numreaders); } diff --git a/contrib/db/liblmdb/midl.c b/contrib/db/liblmdb/midl.c index 75420c41..152a1ec0 100644 --- a/contrib/db/liblmdb/midl.c +++ b/contrib/db/liblmdb/midl.c @@ -3,8 +3,7 @@ /* $OpenLDAP$ */ /* This work is part of OpenLDAP Software . * - * Copyright 2000-2019 The OpenLDAP Foundation. - * Portions Copyright 2001-2018 Howard Chu, Symas Corp. + * Copyright 2000-2015 The OpenLDAP Foundation. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -355,5 +354,67 @@ int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ) return 0; } +#ifdef MDB_VL32 +unsigned mdb_mid3l_search( MDB_ID3L ids, MDB_ID id ) +{ + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = (unsigned)ids[0].mid; + + while( 0 < n ) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = CMP( id, ids[cursor].mid ); + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if( val > 0 ) { + ++cursor; + } + return cursor; +} + +int mdb_mid3l_insert( MDB_ID3L ids, MDB_ID3 *id ) +{ + unsigned x, i; + + x = mdb_mid3l_search( ids, id->mid ); + + if( x < 1 ) { + /* internal error */ + return -2; + } + + if ( x <= ids[0].mid && ids[x].mid == id->mid ) { + /* duplicate */ + return -1; + } + + /* insert id */ + ids[0].mid++; + for (i=(unsigned)ids[0].mid; i>x; i--) + ids[i] = ids[i-1]; + ids[x] = *id; + + return 0; +} +#endif /* MDB_VL32 */ + /** @} */ /** @} */ diff --git a/contrib/db/liblmdb/midl.h b/contrib/db/liblmdb/midl.h index 462c3497..1555ecb1 100644 --- a/contrib/db/liblmdb/midl.h +++ b/contrib/db/liblmdb/midl.h @@ -11,8 +11,7 @@ /* $OpenLDAP$ */ /* This work is part of OpenLDAP Software . * - * Copyright 2000-2019 The OpenLDAP Foundation. - * Portions Copyright 2001-2018 Howard Chu, Symas Corp. + * Copyright 2000-2015 The OpenLDAP Foundation. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,6 +27,7 @@ #define _MDB_MIDL_H_ #include +#include #ifdef __cplusplus extern "C" { @@ -43,7 +43,11 @@ extern "C" { /** A generic unsigned ID number. These were entryIDs in back-bdb. * Preferably it should have the same size as a pointer. */ +#ifdef MDB_VL32 +typedef uint64_t MDB_ID; +#else typedef size_t MDB_ID; +#endif /** An IDL is an ID List, a sorted array of IDs. The first * element of the array is a counter for how many actual @@ -178,6 +182,20 @@ int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id ); */ int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ); +#ifdef MDB_VL32 +typedef struct MDB_ID3 { + MDB_ID mid; /**< The ID */ + void *mptr; /**< The pointer */ + unsigned int mcnt; /**< Number of pages */ + unsigned int mref; /**< Refcounter */ +} MDB_ID3; + +typedef MDB_ID3 *MDB_ID3L; + +unsigned mdb_mid3l_search( MDB_ID3L ids, MDB_ID id ); +int mdb_mid3l_insert( MDB_ID3L ids, MDB_ID3 *id ); + +#endif /* MDB_VL32 */ /** @} */ /** @} */ #ifdef __cplusplus diff --git a/contrib/db/liblmdb/mtest.c b/contrib/db/liblmdb/mtest.c index 6fc5840c..9d15088b 100644 --- a/contrib/db/liblmdb/mtest.c +++ b/contrib/db/liblmdb/mtest.c @@ -1,6 +1,6 @@ /* mtest.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/mtest2.c b/contrib/db/liblmdb/mtest2.c index 64b742aa..eacbe59d 100644 --- a/contrib/db/liblmdb/mtest2.c +++ b/contrib/db/liblmdb/mtest2.c @@ -1,6 +1,6 @@ /* mtest2.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/mtest3.c b/contrib/db/liblmdb/mtest3.c index 81e4bbf9..9db79e62 100644 --- a/contrib/db/liblmdb/mtest3.c +++ b/contrib/db/liblmdb/mtest3.c @@ -1,6 +1,6 @@ /* mtest3.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/mtest4.c b/contrib/db/liblmdb/mtest4.c index c355cf10..6df890e2 100644 --- a/contrib/db/liblmdb/mtest4.c +++ b/contrib/db/liblmdb/mtest4.c @@ -1,6 +1,6 @@ /* mtest4.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/mtest5.c b/contrib/db/liblmdb/mtest5.c index 95793ec1..14e3c0da 100644 --- a/contrib/db/liblmdb/mtest5.c +++ b/contrib/db/liblmdb/mtest5.c @@ -1,6 +1,6 @@ /* mtest5.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/mtest6.c b/contrib/db/liblmdb/mtest6.c index cb0d4d73..ae3c7f26 100644 --- a/contrib/db/liblmdb/mtest6.c +++ b/contrib/db/liblmdb/mtest6.c @@ -1,6 +1,6 @@ /* mtest6.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/sample-bdb.txt b/contrib/db/liblmdb/sample-bdb.txt index 97220f0e..563807a2 100644 --- a/contrib/db/liblmdb/sample-bdb.txt +++ b/contrib/db/liblmdb/sample-bdb.txt @@ -3,7 +3,7 @@ * Do a line-by-line comparison of this and sample-mdb.txt */ /* - * Copyright 2012-2018 Howard Chu, Symas Corp. + * Copyright 2012-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/sample-mdb.txt b/contrib/db/liblmdb/sample-mdb.txt index 1d20ed3d..10a25687 100644 --- a/contrib/db/liblmdb/sample-mdb.txt +++ b/contrib/db/liblmdb/sample-mdb.txt @@ -3,7 +3,7 @@ * Do a line-by-line comparison of this and sample-bdb.txt */ /* - * Copyright 2012-2018 Howard Chu, Symas Corp. + * Copyright 2012-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/common/db_abstract_accessor.h b/src/common/db_abstract_accessor.h index 9b84c9f8..d74937ee 100644 --- a/src/common/db_abstract_accessor.h +++ b/src/common/db_abstract_accessor.h @@ -241,9 +241,9 @@ namespace tools m_is_open = false; return m_backend->close(); } - bool open(const std::string& path, uint64_t flags = 0) + bool open(const std::string& path, uint64_t cache_sz = CACHE_SIZE) { - bool r = m_backend->open(path, flags); + bool r = m_backend->open(path, cache_sz); if(r) m_is_open = true; @@ -558,10 +558,10 @@ namespace tools bool init(const std::string& container_name) { #ifdef ENABLE_PROFILING - m_get_profiler.m_name = container_name +":get"; - m_set_profiler.m_name = container_name + ":set"; - m_explicit_get_profiler.m_name = container_name + ":explicit_get"; - m_explicit_set_profiler.m_name = container_name + ":explicit_set"; + m_get_profiler.m_name = container_name +":get"; + m_set_profiler.m_name = container_name + ":set"; + m_explicit_get_profiler.m_name = container_name + ":explicit_get"; + m_explicit_set_profiler.m_name = container_name + ":explicit_set"; m_commit_profiler.m_name = container_name + ":commit"; #endif return bdb.get_backend()->open_container(container_name, m_h); diff --git a/src/common/db_backend_base.h b/src/common/db_backend_base.h index e3187c82..538abd62 100644 --- a/src/common/db_backend_base.h +++ b/src/common/db_backend_base.h @@ -5,6 +5,13 @@ #pragma once +#ifndef ENV32BIT +#define CACHE_SIZE uint64_t(uint64_t(1UL * 128UL) * 1024UL * 1024UL * 1024UL) +#else +#define CACHE_SIZE (1 * 1024UL * 1024UL * 1024UL) +#endif + + namespace tools { namespace db @@ -25,20 +32,20 @@ namespace tools struct i_db_backend { - virtual bool close() = 0; + virtual bool close()=0; virtual bool begin_transaction(bool read_only = false) = 0; - virtual bool commit_transaction() = 0; - virtual void abort_transaction() = 0; - virtual bool open(const std::string& path, uint64_t flags = 0) = 0; - virtual bool open_container(const std::string& name, container_handle& h) = 0; + virtual bool commit_transaction()=0; + virtual void abort_transaction()=0; + virtual bool open(const std::string& path, uint64_t cache_sz = CACHE_SIZE) = 0; + virtual bool open_container(const std::string& name, container_handle& h)=0; virtual bool erase(container_handle h, const char* k, size_t s) = 0; virtual uint64_t size(container_handle h) = 0; virtual bool get(container_handle h, const char* k, size_t s, std::string& res_buff) = 0; virtual bool set(container_handle h, const char* k, size_t s, const char* v, size_t vs) = 0; virtual bool clear(container_handle h) = 0; - virtual bool enumerate(container_handle h, i_db_callback* pcb) = 0; + virtual bool enumerate(container_handle h, i_db_callback* pcb)=0; virtual bool get_stat_info(stat_info& si) = 0; - virtual ~i_db_backend() {}; + virtual ~i_db_backend(){}; }; } -} +} \ No newline at end of file diff --git a/src/currency_core/blockchain_storage.cpp b/src/currency_core/blockchain_storage.cpp index 61cf312b..827adc65 100644 --- a/src/currency_core/blockchain_storage.cpp +++ b/src/currency_core/blockchain_storage.cpp @@ -76,6 +76,7 @@ DISABLE_VS_WARNINGS(4267) namespace { + const command_line::arg_descriptor arg_db_cache_l1 = { "db-cache-l1", "Specify size of memory mapped db cache file", 0, true }; const command_line::arg_descriptor arg_db_cache_l2 = { "db-cache-l2", "Specify cached elements in db helpers", 0, true }; } @@ -151,6 +152,7 @@ std::shared_ptr blockchain_storage::get_tx(const crypto::hash &id) //------------------------------------------------------------------ void blockchain_storage::init_options(boost::program_options::options_description& desc) { + command_line::add_arg(desc, arg_db_cache_l1); command_line::add_arg(desc, arg_db_cache_l2); } //------------------------------------------------------------------ @@ -213,20 +215,36 @@ bool blockchain_storage::init(const std::string& config_folder, const boost::pro return false; } + uint64_t cache_size_l1 = CACHE_SIZE; + if (command_line::has_arg(vm, arg_db_cache_l1)) + { + cache_size_l1 = command_line::get_arg(vm, arg_db_cache_l1); + } + LOG_PRINT_GREEN("Using db file cache size(L1): " << cache_size_l1, LOG_LEVEL_0); + m_config_folder = config_folder; + + // remove old incompartible DB + const std::string old_db_folder_path = m_config_folder + "/" CURRENCY_BLOCKCHAINDATA_FOLDERNAME_OLD; + if (boost::filesystem::exists(old_db_folder_path)) + { + LOG_PRINT_YELLOW("Removing old DB in " << old_db_folder_path << "...", LOG_LEVEL_0); + boost::filesystem::remove_all(old_db_folder_path); + } + const std::string db_folder_path = m_config_folder + "/" CURRENCY_BLOCKCHAINDATA_FOLDERNAME; LOG_PRINT_L0("Loading blockchain from " << db_folder_path); bool db_opened_okay = false; for(size_t loading_attempt_no = 0; loading_attempt_no < 2; ++loading_attempt_no) { - bool res = m_db.open(db_folder_path); + bool res = m_db.open(db_folder_path, cache_size_l1); if (!res) { // if DB could not be opened -- try to remove the whole folder and re-open DB LOG_PRINT_YELLOW("Failed to initialize database in folder: " << db_folder_path << ", first attempt", LOG_LEVEL_0); boost::filesystem::remove_all(db_folder_path); - res = m_db.open(db_folder_path); + res = m_db.open(db_folder_path, cache_size_l1); CHECK_AND_ASSERT_MES(res, false, "Failed to initialize database in folder: " << db_folder_path << ", second attempt"); } diff --git a/src/currency_core/currency_config.h b/src/currency_core/currency_config.h index cb9b0dcb..11d2ec06 100644 --- a/src/currency_core/currency_config.h +++ b/src/currency_core/currency_config.h @@ -187,8 +187,10 @@ #define CURRENCY_CORE_INSTANCE_LOCK_FILE "lock.lck" -#define CURRENCY_POOLDATA_FOLDERNAME "poolstate" -#define CURRENCY_BLOCKCHAINDATA_FOLDERNAME "blockchain" +#define CURRENCY_POOLDATA_FOLDERNAME_OLD "poolstate" +#define CURRENCY_BLOCKCHAINDATA_FOLDERNAME_OLD "blockchain" +#define CURRENCY_POOLDATA_FOLDERNAME "poolstate_lmdb_v1" +#define CURRENCY_BLOCKCHAINDATA_FOLDERNAME "blockchain_lmdb_v1" #define P2P_NET_DATA_FILENAME "p2pstate.bin" #define MINER_CONFIG_FILENAME "miner_conf.json" #define GUI_SECURE_CONFIG_FILENAME "gui_secure_conf.bin" diff --git a/src/currency_core/tx_pool.cpp b/src/currency_core/tx_pool.cpp index 99838ead..6803ac16 100644 --- a/src/currency_core/tx_pool.cpp +++ b/src/currency_core/tx_pool.cpp @@ -1120,19 +1120,30 @@ namespace currency { m_config_folder = config_folder; - LOG_PRINT_L0("Loading blockchain..."); + uint64_t cache_size_l1 = CACHE_SIZE; + LOG_PRINT_GREEN("Using pool db file cache size(L1): " << cache_size_l1, LOG_LEVEL_0); + + // remove old incompartible DB + const std::string old_db_folder_path = m_config_folder + "/" CURRENCY_POOLDATA_FOLDERNAME_OLD; + if (boost::filesystem::exists(old_db_folder_path)) + { + LOG_PRINT_YELLOW("Removing old DB in " << old_db_folder_path << "...", LOG_LEVEL_0); + boost::filesystem::remove_all(old_db_folder_path); + } + const std::string db_folder_path = m_config_folder + "/" CURRENCY_POOLDATA_FOLDERNAME; - + LOG_PRINT_L0("Loading blockchain from " << db_folder_path << "..."); + bool db_opened_okay = false; for(size_t loading_attempt_no = 0; loading_attempt_no < 2; ++loading_attempt_no) { - bool res = m_db.open(db_folder_path); + bool res = m_db.open(db_folder_path, cache_size_l1); if (!res) { // if DB could not be opened -- try to remove the whole folder and re-open DB LOG_PRINT_YELLOW("Failed to initialize database in folder: " << db_folder_path << ", first attempt", LOG_LEVEL_0); boost::filesystem::remove_all(db_folder_path); - res = m_db.open(db_folder_path); + res = m_db.open(db_folder_path, cache_size_l1); CHECK_AND_ASSERT_MES(res, false, "Failed to initialize database in folder: " << db_folder_path << ", second attempt"); } diff --git a/src/daemon/daemon.cpp b/src/daemon/daemon.cpp index 9851a677..65b58cb1 100644 --- a/src/daemon/daemon.cpp +++ b/src/daemon/daemon.cpp @@ -100,7 +100,7 @@ int main(int argc, char* argv[]) #endif log_space::get_set_log_detalisation_level(true, LOG_LEVEL_2); log_space::log_singletone::add_logger(LOGGER_CONSOLE, NULL, NULL); - log_space::log_singletone::enable_channels("core,currency_protocol,tx_pool,wallet,lmdb"); + log_space::log_singletone::enable_channels("core,currency_protocol,tx_pool,wallet"); LOG_PRINT_L0("Starting..."); tools::signal_handler::install_fatal([](int sig_number, void* address) { diff --git a/tests/unit_tests/db_accessors.cpp b/tests/unit_tests/db_accessors.cpp index 078f1ac5..826eb0a6 100644 --- a/tests/unit_tests/db_accessors.cpp +++ b/tests/unit_tests/db_accessors.cpp @@ -29,8 +29,9 @@ TEST(db_accessor_tests, cached_key_value_accessor_test) tools::db::cached_key_value_accessor m_container(m_db); const std::string folder_name = "./TEST_cached_key_value_accessor_test"; tools::create_directories_if_necessary(folder_name); + uint64_t cache_size = CACHE_SIZE; - ASSERT_TRUE(m_db.open(folder_name)); + ASSERT_TRUE(m_db.open(folder_name, cache_size)); ASSERT_TRUE(m_container.init("container")); ... TODO ... @@ -46,7 +47,8 @@ TEST(db_accessor_tests_2, recoursive_tx_test) const std::string folder_name = "./TEST_db_recursive_tx"; tools::create_directories_if_necessary(folder_name); - ASSERT_TRUE(m_db.open(folder_name)); + uint64_t cache_size = CACHE_SIZE; + ASSERT_TRUE(m_db.open(folder_name, cache_size)); ASSERT_TRUE(m_container.init("zzzz") ); bool tx_result = m_container.begin_transaction(); @@ -317,7 +319,8 @@ TEST(db_accessor_tests, median_db_cache_test) const std::string folder_name = "./TEST_median_db_cache"; const std::string naive_median_serialization_filename = folder_name + "/naive_median"; tools::create_directories_if_necessary(folder_name); - ASSERT_TRUE(m_db.open(folder_name)); + uint64_t cache_size = CACHE_SIZE; + ASSERT_TRUE(m_db.open(folder_name, cache_size)); ASSERT_TRUE(m_tx_fee_median.init("median_fee")); m_db.begin_transaction(); diff --git a/tests/unit_tests/lmdb_tests.cpp b/tests/unit_tests/lmdb_tests.cpp index 7e03cf13..10dc1229 100644 --- a/tests/unit_tests/lmdb_tests.cpp +++ b/tests/unit_tests/lmdb_tests.cpp @@ -30,7 +30,7 @@ namespace lmdb_test // write data // - r = bdba.open(db_file_path); + r = bdba.open(db_file_path, CACHE_SIZE); ASSERT_TRUE(r); db::container_handle h;